[Mlir-commits] [mlir] Add a tutorial on mlir-opt (PR #96105)

Fri Jul 26 17:10:34 PDT 2024

https://github.com/j2kun updated https://github.com/llvm/llvm-project/pull/96105

>From f78800ed9e62bd43f5bbda5bed6395801db7bc68 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:29:23 -0700
Subject: [PATCH 01/19] start writing mlir-opt tutorial

---
 mlir/docs/Tutorials/MlirOpt.md | 200 +++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 mlir/docs/Tutorials/MlirOpt.md

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
new file mode 100644
index 0000000000000..dea61fc81a38a
--- /dev/null
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -0,0 +1,200 @@
+# Using `mlir-opt`
+
+`mlir-opt` is the command-line entry point for running passes and lowerings on MLIR code.
+This tutorial will explain how to use `mlir-opt` to run passes, and explain
+some details about MLIR's built-in dialects along the way.
+
+Prerequisites:
+
+- [Building MLIR from source](/getting_started/)
+
+[TOC]
+
+## Overview
+
+We start with a brief summary of context that helps to frame
+the uses of `mlir-opt` detailed in this article.
+For a deeper dive on motivation and design,
+see [the MLIR paper](https://arxiv.org/abs/2002.11054).
+
+Two of the central concepts in MLIR are *dialects* and *lowerings*.
+In traditional compilers, there is typically one "dialect,"
+called an *intermediate representation*, or IR,
+that is the textual or data-structural description of a program
+within the scope of the compiler's execution.
+For example, in GCC the IR is called GIMPLE,
+and in LLVM it's called LLVM-IR.
+Compilers typically convert an input program to their IR,
+run optimization passes,
+and then convert the optimized IR to machine code.
+
+MLIR's philosophy is to split the job into smaller steps.
+First, MLIR allows one to define many IRs called *dialects*,
+some considered "high level" and some "low level,"
+but each with a set of types, operations, metadata,
+and semantics that defines what the operations do.
+Different dialects may coexist in the same program.
+Then, one writes a set of *lowering passes*
+that incrementally converts different parts of the program
+from higher level dialects to lower and lower dialects
+until you get to machine code
+(or, in many cases, LLVM, which finishes the job).
+Along the way,
+*optimizing passes* are run to make the code more efficient.
+The main point here is that the high level dialects exist
+*so that* they make it easy to write these important optimizing passes.
+
+The main motivation for building MLIR
+was to build the `affine` dialect,
+which is designed to enable [polyhedral optimizations](https://polyhedral.info/)
+for loop transformations.
+Compiler engineers had previously implemented polyhedral optimizations in LLVM and GCC (without an `affine` dialect),
+and it was difficult because they had to reconstruct a well-structured loop nest
+from a much more complicated set of low-level operations.
+MLIR instead keeps the structure in the higher level operations for optimizations,
+and then discards it during lowering passes.
+
+The `mlir-opt` tool can run both
+optimization passes and lowerings,
+though the final code generation
+is performed by a different tool called `mlir-translate`.
+In particular, `mlir-opt` consumes MLIR as input and produces MLIR as output,
+while `mlir-translate` consumes MLIR as input
+and produces non-MLIR program representations as output.
+
+## Two example programs
+
+Here are two MLIR programs
+that define a function that counts the leading zeroes of a 32-bit integer (`i32`).
+The first uses the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation and just returns the result.
+
+```mlir
+func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}
+```
+
+This shows the basic structure of an MLIR operation
+([see here](https://mlir.llvm.org/docs/LangRef/#operations) for a more complete spec).
+Variable names are prefixed with `%`,
+functions by `@`,
+and each variable/value in a program has a type,
+often expressed after a colon.
+In this case all the types are `i32`,
+except for the function type which is `(i32) -> i32`
+(not specified explicitly above, but you'll see it in the `func.call` later).
+
+Each statement is anchored around an expression like `math.ctlz`
+which specifies the dialect [`math`](https://mlir.llvm.org/docs/Dialects/MathOps/) via a namespace,
+and the operation [`ctlz`](https://mlir.llvm.org/docs/Dialects/MathOps/#mathctlz-mathcountleadingzerosop) after the `.`.
+The rest of the syntax of the operation
+is determined by a parser defined by the dialect,
+and so many operations will have different syntaxes.
+In the case of `math.ctlz`,
+the sole argument is an integer whose leading zeros are to be counted,
+and the trailing ` : i32` denotes the output type storing the count.
+
+It's important to note that [`func`](https://mlir.llvm.org/docs/Dialects/Func/) is itself a dialect,
+and [`func.func`](https://mlir.llvm.org/docs/Dialects/Func/#funcfunc-funcfuncop) is considered an "operation,"
+where the braces and the function's body are part of the syntax.
+In MLIR a list of operations within braces is called a [*region*](https://mlir.llvm.org/docs/LangRef/#regions),
+and an operation can have zero regions like `math.ctlz`,
+one region like `func.func`,
+or multiple regions like [`scf.if`](https://mlir.llvm.org/docs/Dialects/SCFDialect/#scfif-scfifop),
+which has a region for each of its two control flow branches.
+
+The second program is a sequence of two loops
+that exhibit poor cache locality.
+
+```mlir
+
+```
+
+## Lowering `ctlz`
+
+The second version of this program has a software implementation of the `ctlz` function and calls it.
+
+```mlir
+func.func @main(%arg0: i32) -> i32 {
+  %0 = func.call @my_ctlz(%arg0) : (i32) -> i32
+  func.return %0 : i32
+}
+func.func @my_ctlz(%arg0: i32) -> i32 {
+  %c32_i32 = arith.constant 32 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %0 = arith.cmpi eq, %arg0, %c0_i32 : i32
+  %1 = scf.if %0 -> (i32) {
+    scf.yield %c32_i32 : i32
+  } else {
+    %c1 = arith.constant 1 : index
+    %c1_i32 = arith.constant 1 : i32
+    %c32 = arith.constant 32 : index
+    %c0_i32_0 = arith.constant 0 : i32
+    %2:2 = scf.for %arg1 = %c1 to %c32 step %c1 iter_args(%arg2 = %arg0, %arg3 = %c0_i32_0) -> (i32, i32) {
+      %3 = arith.cmpi slt, %arg2, %c0_i32 : i32
+      %4:2 = scf.if %3 -> (i32, i32) {
+        scf.yield %arg2, %arg3 : i32, i32
+      } else {
+        %5 = arith.addi %arg3, %c1_i32 : i32
+        %6 = arith.shli %arg2, %c1_i32 : i32
+        scf.yield %6, %5 : i32, i32
+      }
+      scf.yield %4#0, %4#1 : i32, i32
+    }
+    scf.yield %2#1 : i32
+  }
+  func.return %1 : i32
+}
+```
+
+The algorithm above is not relevant to this post, but either way it is quite simple: count the leading zeros by shifting the input left one bit at a time until it becomes negative (as a signed integer), because that occurs exactly when its leading bit is a 1. Then add a special case to handle zero, which would loop infinitely otherwise.
+
+Here you can see two more MLIR dialects. [`arith`](https://mlir.llvm.org/docs/Dialects/ArithOps/) is for low-level arithmetic and boolean conditions on integers and floats. You can define constants, compare integers with `arith.cmpi`, and do things like add and bit shift (`arith.shli` is a left shift). [`scf`](https://mlir.llvm.org/docs/Dialects/SCFDialect/), short for "structured control flow," defines for loops, while loops, and control flow branching. `scf.yield` defines the "output" value from each region of an if/else operation or loop body which is necessary here because, as you can see, an `if` operation has a result value.
+
+Two other minor aspects of the syntax are on display. First is the syntax `%4:2`, which defines a variable `%4` which is a tuple of two values. The corresponding `%4#1` accesses the second entry in the tuple. Second, you'll notice there's a type called `index` that is different from `i32`. Though they both represent integers, `index` is intended to be a platform-dependent integer type which is suitable for indexing arrays, representing sizes and dimensions of things, and, in our case, being loop counters and iteration bounds. More details on [`index` in the MLIR docs](https://mlir.llvm.org/docs/Rationale/Rationale/#integer-signedness-semantics).
+
+## Lowerings and the math-to-funcs pass
+
+We have two versions of the same program because one is a lowered version of the other. In most cases, the machine you're going to run a program on has a "count leading zeros" function, so the lowering would simply map `math.ctlz` to the corresponding machine instruction. But if there is no `ctlz` instruction, a lowering can provide an implementation in terms of lower level dialects and ops. Specifically, this one lowers ctlz to {`func`, `arith`, `scf`}.
+
+The second version of this code was actually generated by the `mlir-opt` command line tool, which is the main entry-point to running MLIR passes on specific MLIR code. For starters, one can take the `mlir-opt` tool and run it with no arguments on any MLIR code, and it will parse it, verify it is well formed, and print it back out with some slight normalizations. In this case, it will wrap the code in a `module`, which is a namespace isolation mechanism.
+
+```bash
+$ echo 'func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}' > ctlz.mlir
+$ bazel run @llvm-project//mlir:mlir-opt -- $(pwd)/ctlz.mlir
+<... snip ...>
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = math.ctlz %arg0 : i32
+    return %0 : i32
+  }
+}
+```
+
+**Aside:** The `-- $(pwd)/ctlz.mlir` is a quirk of bazel. When one program runs another program, the `--` is the standard mechanism to separate CLI flags from the runner program (`bazel`) and the run program (`mlir-opt`). Everything after `--` goes to `mlir-opt`. Also, the need for `$(pwd)` is because when bazel runs `mlir-opt`, it runs it with a working directory that is in some temporary, isolated location on the filesystem. So we need to give it an absolute path to the MLIR file to input. Or we could pipe from standard in. Or we could run the `mlir-opt` binary directly from `bazel-bin/external/llvm-project/mlir/mlir-opt`.
+
+Next we can run our first lowering, which is already built-in to `mlir-opt`, and which generates the long program above.
+
+```bash
+$ bazel run @llvm-project//mlir:mlir-opt -- --convert-math-to-funcs=convert-ctlz $(pwd)/ctlz.mlir
+<... snip ...>
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = call @__mlir_math_ctlz_i32(%arg0) : (i32) -> i32
+    return %0 : i32
+  }
+  func.func private @__mlir_math_ctlz_i32(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+<... snip ...>
+```
+
+Each pass gets its own command line flag, some are grouped into pipelines, and the `--pass-pipeline` command line flag can be used to provide (a serialized version of) an ordered list of passes to run on the input MLIR.
+
+We won't cover the internal workings of the `math-to-funcs` pass in this or a future article, but next time we will actually write our own, simpler pass that does something nontrivial. Until then, I'll explain a bit about how testing works in MLIR, using these two ctlz programs as example test cases.
+
+For those who are interested, the MLIR documentation contains a [complete list of passes](https://mlir.llvm.org/docs/Passes/) owned by the upstream MLIR project, which can be used by invoking the corresponding command line flag or nesting it inside of a larger `--pass-pipeline`.
+
+## Optimizing `affine.for`

>From 105b1ffa6be81ea4e1a61390fdd157858cbb3bc4 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:58:57 -0700
Subject: [PATCH 02/19] find the right program for affine-loop-fusion

---
 mlir/docs/Tutorials/MlirOpt.md | 41 ++++++++++++++++++++++++++++++----
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index dea61fc81a38a..c4a72948f8b79 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -104,14 +104,47 @@ one region like `func.func`,
 or multiple regions like [`scf.if`](https://mlir.llvm.org/docs/Dialects/SCFDialect/#scfif-scfifop),
 which has a region for each of its two control flow branches.
 
-The second program is a sequence of two loops
-that exhibit poor cache locality.
+The second program is a sequence of loops
+that exhibits poor cache locality.
 
 ```mlir
-
+func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+  %0 = memref.alloc() : memref<10xf32>
+  %1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %arg2 = 0 to 10 {
+    affine.store %cst, %0[%arg2] : memref<10xf32>
+    affine.store %cst, %1[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %0[%arg2] : memref<10xf32>
+    %3 = arith.addf %2, %2 : f32
+    affine.store %3, %arg0[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %1[%arg2] : memref<10xf32>
+    %3 = arith.mulf %2, %2 : f32
+    affine.store %3, %arg1[%arg2] : memref<10xf32>
+  }
+  return
+}
 ```
 
-## Lowering `ctlz`
+This program introduces some additional dialects.
+The [`affine` dialect](https://mlir.llvm.org/docs/Dialects/Affine/) mentioned in the introduction
+represents well-structured loop nests,
+and the [`affine.for` operation](https://mlir.llvm.org/docs/Dialects/Affine/#affinefor-affineaffineforop)
+whose region corresponds to the loop body.
+`affine.for` also showcases some custom-defined syntax
+to represent the loop bounds and loop induction variable.
+The [`memref` dialect](https://mlir.llvm.org/docs/Dialects/MemRef/)
+defines types and operations related to memory management
+with pointer semantics.
+Note also that while `memref` has store and load operations,
+`affine` has its own that limit what types of memory accesses are allowed,
+so as to ensure the well-structuredness of the loop nest.
+
+## Lowering `ctlz` in two ways
 
 The second version of this program has a software implementation of the `ctlz` function and calls it.
 

>From 818961d80ce2ff5a1692a8c4fa698a181ccbbd19 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Fri, 31 May 2024 22:04:01 -0700
Subject: [PATCH 03/19] finish first draft of tutorial

---
 mlir/docs/Tutorials/MlirOpt.md               | 323 +++++++++++++++----
 mlir/test/Examples/mlir-opt/ctlz.mlir        |   4 +
 mlir/test/Examples/mlir-opt/loop-fusion.mlir |  20 ++
 3 files changed, 284 insertions(+), 63 deletions(-)
 create mode 100644 mlir/test/Examples/mlir-opt/ctlz.mlir
 create mode 100644 mlir/test/Examples/mlir-opt/loop-fusion.mlir

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index c4a72948f8b79..f8a4f312a87dd 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -44,14 +44,16 @@ Along the way,
 The main point here is that the high level dialects exist
 *so that* they make it easy to write these important optimizing passes.
 
-The main motivation for building MLIR
+A central motivation for building MLIR
 was to build the `affine` dialect,
 which is designed to enable [polyhedral optimizations](https://polyhedral.info/)
 for loop transformations.
-Compiler engineers had previously implemented polyhedral optimizations in LLVM and GCC (without an `affine` dialect),
-and it was difficult because they had to take a reconstruct a well-structured loop nest
+Compiler engineers had previously implemented polyhedral optimizations
+in LLVM and GCC (without an `affine` dialect),
+and it was difficult because they had to reconstruct well-structured loop nests
 from a much more complicated set of low-level operations.
-MLIR instead keeps the structure in the higher level operations for optimizations,
+Having a higher level `affine` dialect preserves the loop nest structure
+at an abstraction layer that makes it easier to write optimizations,
 and then discards it during lowering passes.
 
 The `mlir-opt` tool can run both
@@ -64,9 +66,9 @@ and produces non-MLIR program representations as output.
 
 ## Two example programs
 
-Here are two MLIR programs
-that define a function that counts the leading zeroes of a 32-bit integer (`i32`).
-The first uses the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation and just returns the result.
+Here are two MLIR programs.
+The first defines a function that counts the leading zeroes of a 32-bit integer (`i32`)
+using the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation.
 
 ```mlir
 func.func @main(%arg0: i32) -> i32 {
@@ -96,7 +98,7 @@ the sole argument is an integer whose leading zeros are to be counted,
 and the trailing ` : i32` denotes the output type storing the count.
 
 It's important to note that [`func`](https://mlir.llvm.org/docs/Dialects/Func/) is itself a dialect,
-and [`func.func`](https://mlir.llvm.org/docs/Dialects/Func/#funcfunc-funcfuncop) is considered an "operation,"
+and [`func.func`](https://mlir.llvm.org/docs/Dialects/Func/#funcfunc-funcfuncop) is an operation,
 where the braces and the function's body are part of the syntax.
 In MLIR a list of operations within braces is called a [*region*](https://mlir.llvm.org/docs/LangRef/#regions),
 and an operation can have zero regions like `math.ctlz`,
@@ -134,7 +136,7 @@ This program introduces some additional dialects.
 The [`affine` dialect](https://mlir.llvm.org/docs/Dialects/Affine/) mentioned in the introduction
 represents well-structured loop nests,
 and the [`affine.for` operation](https://mlir.llvm.org/docs/Dialects/Affine/#affinefor-affineaffineforop)
-whose region corresponds to the loop body.
+has a region that corresponds to the loop's body.
 `affine.for` also showcases some custom-defined syntax
 to represent the loop bounds and loop induction variable.
 The [`memref` dialect](https://mlir.llvm.org/docs/Dialects/MemRef/)
@@ -144,90 +146,285 @@ Note also that while `memref` has store and load operations,
 `affine` has its own that limit what types of memory accesses are allowed,
 so as to ensure the well-structuredness of the loop nest.
 
-## Lowering `ctlz` in two ways
+## Running `mlir-opt`
 
-The second version of this program has a software implementation of the `ctlz` function and calls it.
+After building the MLIR project,
+the `mlir-opt` binary (located in `build/bin`)
+is the entry point for running passes and lowerings,
+as well as emitting debug and diagnostic data.
+
+Running `mlir-opt` with no flags will consume MLIR input
+from standard in, parse and run verifiers on it,
+and write the MLIR back to standard out.
+This is a good way to test if an input MLIR is well-formed.
+
+`mlir-opt --help` shows a complete list of flags
+(there are nearly 1000).
+Each pass gets its own flag.
+
+## Lowering `ctlz`
+
+Next we will show two of MLIR's lowering passes.
+The first, `convert-math-to-llvm`, converts the `ctlz` op
+to the `llvm` dialect's [`intr.ctlz` op](https://mlir.llvm.org/docs/Dialects/LLVM/#llvmintrctlz-llvmcountleadingzerosop)
+which is an LLVM intrinsic.
+Note that `llvm` here is MLIR's `llvm` dialect,
+which would still need to be processed through `mlir-translate`
+to generate LLVM-IR.
+
+Recall our ctlz program:
 
 ```mlir
+// mlir/test/Examples/mlir-opt/ctlz.mlir
 func.func @main(%arg0: i32) -> i32 {
-  %0 = func.call @my_ctlz(%arg0) : (i32) -> i32
+  %0 = math.ctlz %arg0 : i32
   func.return %0 : i32
 }
-func.func @my_ctlz(%arg0: i32) -> i32 {
-  %c32_i32 = arith.constant 32 : i32
-  %c0_i32 = arith.constant 0 : i32
-  %0 = arith.cmpi eq, %arg0, %c0_i32 : i32
-  %1 = scf.if %0 -> (i32) {
-    scf.yield %c32_i32 : i32
-  } else {
-    %c1 = arith.constant 1 : index
-    %c1_i32 = arith.constant 1 : i32
-    %c32 = arith.constant 32 : index
-    %c0_i32_0 = arith.constant 0 : i32
-    %2:2 = scf.for %arg1 = %c1 to %c32 step %c1 iter_args(%arg2 = %arg0, %arg3 = %c0_i32_0) -> (i32, i32) {
-      %3 = arith.cmpi slt, %arg2, %c0_i32 : i32
-      %4:2 = scf.if %3 -> (i32, i32) {
-        scf.yield %arg2, %arg3 : i32, i32
-      } else {
-        %5 = arith.addi %arg3, %c1_i32 : i32
-        %6 = arith.shli %arg2, %c1_i32 : i32
-        scf.yield %6, %5 : i32, i32
-      }
-      scf.yield %4#0, %4#1 : i32, i32
-    }
-    scf.yield %2#1 : i32
-  }
-  func.return %1 : i32
-}
 ```
 
-The algorithm above is not relevant to this post, but either way it is quite simple: count the leading zeros by shifting the input left one bit at a time until it becomes negative (as a signed integer), because that occurs exactly when its leading bit is a 1. Then add a special case to handle zero, which would loop infinitely otherwise.
+After building MLIR, and from the `llvm-project` base directory, run
 
-Here you can see two more MLIR dialects. [`arith`](https://mlir.llvm.org/docs/Dialects/ArithOps/) is for low-level arithmetic and boolean conditions on integers and floats. You can define constants, compare integers with `arith.cmpi`, and do things like add and bit shift (`arith.shli` is a left shift). [`scf`](https://mlir.llvm.org/docs/Dialects/SCFDialect/), short for "structured control flow," defines for loops, while loops, and control flow branching. `scf.yield` defines the "output" value from each region of an if/else operation or loop body which is necessary here because, as you can see, an `if` operation has a result value.
+```bash
+build/bin/mlir-opt --convert-math-to-llvm mlir/test/Examples/mlir-opt/ctlz.mlir
+```
 
-Two other minor aspects of the syntax are on display. First is the syntax `%4:2`, which defines a variable `%4` which is a tuple of two values. The corresponding `%4#1` accesses the second entry in the tuple. Second, you'll notice there's a type called `index` that is different from `i32`. Though they both represent integers, `index` is intended to be a platform-dependent integer type which is suitable for indexing arrays, representing sizes and dimensions of things, and, in our case, being loop counters and iteration bounds. More details on [`index` in the MLIR docs](https://mlir.llvm.org/docs/Rationale/Rationale/#integer-signedness-semantics).
+which produces
 
-## Lowerings and the math-to-funcs pass
+```mlir
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = "llvm.intr.ctlz"(%arg0) <{is_zero_poison = false}> : (i32) -> i32
+    return %0 : i32
+  }
+}
+```
 
-We have two versions of the same program because one is a lowered version of the other. In most cases, the machine you're going to run a program on has a "count leading zeros" function, so the lowering would simply map `math.ctlz` to the corresponding machine instruction. But if there is no `ctlz` instruction, a lowering can provide an implementation in terms of lower level dialects and ops. Specifically, this one lowers ctlz to {`func`, `arith`, `scf`}.
+As you can see, the `math` dialect was the only thing that changed
+due to the lowering.
+The `func` dialect was left alone,
+even though it also needs to be converted to the `llvm` dialect
+to generate LLVM-IR.
 
-The second version of this code was actually generated by the `mlir-opt` command line tool, which is the main entry-point to running MLIR passes on specific MLIR code. For starters, one can take the `mlir-opt` tool and run it with no arguments on any MLIR code, and it will parse it, verify it is well formed, and print it back out with some slight normalizations. In this case, it will wrap the code in a `module`, which is a namespace isolation mechanism.
+What if the target machine does not have a `ctlz` intrinsic?
+In this case, one can run the `--convert-math-to-funcs`
+pass, which replaces the op with an implementation using
+other MLIR dialects.
 
 ```bash
-$ echo 'func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
-}' > ctlz.mlir
-$ bazel run @llvm-project//mlir:mlir-opt -- $(pwd)/ctlz.mlir
-<... snip ...>
+build/bin/mlir-opt --convert-math-to-funcs=convert-ctlz mlir/test/Examples/mlir-opt/ctlz.mlir
+```
+
+You will see something similar to:
+
+```mlir
 module {
   func.func @main(%arg0: i32) -> i32 {
-    %0 = math.ctlz %arg0 : i32
+    %0 = call @__mlir_math_ctlz_i32(%arg0) : (i32) -> i32
     return %0 : i32
   }
+  func.func private @__mlir_math_ctlz_i32(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+    %c32_i32 = arith.constant 32 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %0 = arith.cmpi eq, %arg0, %c0_i32 : i32
+    %1 = scf.if %0 -> (i32) {
+      scf.yield %c32_i32 : i32
+    } else {
+      %c1 = arith.constant 1 : index
+      %c1_i32 = arith.constant 1 : i32
+      %c32 = arith.constant 32 : index
+      %c0_i32_0 = arith.constant 0 : i32
+      %2:2 = scf.for %arg1 = %c1 to %c32 step %c1 iter_args(%arg2 = %arg0, %arg3 = %c0_i32_0) -> (i32, i32) {
+        %3 = arith.cmpi slt, %arg2, %c0_i32 : i32
+        %4:2 = scf.if %3 -> (i32, i32) {
+          scf.yield %arg2, %arg3 : i32, i32
+        } else {
+          %5 = arith.addi %arg3, %c1_i32 : i32
+          %6 = arith.shli %arg2, %c1_i32 : i32
+          scf.yield %6, %5 : i32, i32
+        }
+        scf.yield %4#0, %4#1 : i32, i32
+      }
+      scf.yield %2#1 : i32
+    }
+    return %1 : i32
+  }
 }
 ```
 
-**Aside:** The `-- $(pwd)/ctlz.mlir` is a quirk of bazel. When one program runs another program, the `--` is the standard mechanism to separate CLI flags from the runner program (`bazel`) and the run program (`mlir-opt`). Everything after `--` goes to `mlir-opt`. Also, the need for `$(pwd)` is because when bazel runs `mlir-opt`, it runs it with a working directory that is in some temporary, isolated location on the filesystem. So we need to give it an absolute path to the MLIR file to input. Or we could pipe from standard in. Or we could run the `mlir-opt` binary directly from `bazel-bin/external/llvm-project/mlir/mlir-opt`.
+The algorithm above is not relevant to this tutorial,
+but either way it is quite simple:
+count the leading zeros by shifting the input left one bit at a time
+until it becomes negative (as a signed integer),
+because that occurs exactly when its leading bit is a 1.
+Then add a special case to handle zero,
+which would loop infinitely otherwise.
+
+Here you can see two more MLIR dialects.
+[`arith`](https://mlir.llvm.org/docs/Dialects/ArithOps/)
+is for low-level arithmetic
+and boolean conditions on integers and floats.
+You can define constants,
+compare integers with `arith.cmpi`,
+and do things like add and bit shift (`arith.shli` is a left shift).
+[`scf`](https://mlir.llvm.org/docs/Dialects/SCFDialect/),
+short for "structured control flow,"
+defines for loops, while loops,
+and control flow branching using regions.
+`scf.yield` defines the "output" value
+from each region of an if/else operation or loop body
+which is necessary here because an `if` operation has a result value.
+The "structured" in `scf` is in contrast to
+[`cf`](https://mlir.llvm.org/docs/Dialects/ControlFlowDialect/),
+for "(unstructured) control flow,"
+which does low-level, region-free control flow
+that jumps between SSA blocks on a control flow graph.
+
+Two other minor aspects of the syntax are on display.
+First is the syntax `%4:2`,
+which defines a variable `%4` as a tuple of two values.
+The corresponding `%4#1` accesses the second entry in the tuple.
+Second, there's a type called `index` that is different from `i32`.
+Though they both represent integers,
+[`index`](https://mlir.llvm.org/docs/Rationale/Rationale/#integer-signedness-semantics) is a platform-dependent integer type
+suitable for indexing arrays,
+representing sizes and dimensions of things,
+and, in the above program,
+being loop counters and iteration bounds.
+
+## Optimizing loop nests
+
+Recall our second program, the poorly-tuned loops.
 
-Next we can run our first lowering, which is already built-in to `mlir-opt`, and which generates the long program above.
+```mlir
+// mlir/test/Examples/mlir-opt/loop-fusion.mlir
+func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+  %0 = memref.alloc() : memref<10xf32>
+  %1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %arg2 = 0 to 10 {
+    affine.store %cst, %0[%arg2] : memref<10xf32>
+    affine.store %cst, %1[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %0[%arg2] : memref<10xf32>
+    %3 = arith.addf %2, %2 : f32
+    affine.store %3, %arg0[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %1[%arg2] : memref<10xf32>
+    %3 = arith.mulf %2, %2 : f32
+    affine.store %3, %arg1[%arg2] : memref<10xf32>
+  }
+  return
+}
+```
+
+Running this with the [`affine-loop-fusion`](https://mlir.llvm.org/docs/Passes/#-affine-loop-fusion) pass
+produces a fused loop.
 
 ```bash
-$ bazel run @llvm-project//mlir:mlir-opt -- --convert-math-to-funcs=convert-ctlz $(pwd)/ctlz.mlir
-<... snip ...>
+build/bin/mlir-opt --affine-loop-fusion mlir/test/Examples/mlir-opt/loop-fusion.mlir
+```
+
+```mlir
 module {
-  func.func @main(%arg0: i32) {
-    %0 = call @__mlir_math_ctlz_i32(%arg0) : (i32) -> i32
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %alloc = memref.alloc() : memref<1xf32>
+    %alloc_0 = memref.alloc() : memref<1xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %alloc[0] : memref<1xf32>
+      affine.store %cst, %alloc_0[0] : memref<1xf32>
+      %0 = affine.load %alloc_0[0] : memref<1xf32>
+      %1 = arith.mulf %0, %0 : f32
+      affine.store %1, %arg1[%arg2] : memref<10xf32>
+      %2 = affine.load %alloc[0] : memref<1xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
     return
   }
-  func.func private @__mlir_math_ctlz_i32(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
-<... snip ...>
+}
+```
+
+This pass has options that allow the user to configure its behavior.
+For example, the `fusion-compute-tolerance` option
+is described as the "fractional increase in additional computation tolerated while fusing."
+If this value is set to zero on the command line,
+the pass will not fuse the loops.
+
+```bash
+build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' mlir/test/Examples/mlir-opt/loop-fusion.mlir
+```
+
+```mlir
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %alloc = memref.alloc() : memref<10xf32>
+    %alloc_0 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %alloc[%arg2] : memref<10xf32>
+      affine.store %cst, %alloc_0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %0 = affine.load %alloc[%arg2] : memref<10xf32>
+      %1 = arith.addf %0, %0 : f32
+      affine.store %1, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %0 = affine.load %alloc_0[%arg2] : memref<10xf32>
+      %1 = arith.mulf %0, %0 : f32
+      affine.store %1, %arg1[%arg2] : memref<10xf32>
+    }
+    return
+  }
+}
+```
+
+Options passed to a pass
+should come in the form of a quoted string
+(to join all options into a single shell argument)
+with space-separated `key=value` pairs.
+
+## Building a pass pipeline on the command line
+
+One can combine passes on the command line in two ways.
+
+First, by simply placing the pass flags one after the other,
+they will be run in order.
+
+```bash
+build/bin/mlir-opt --convert-math-to-llvm --convert-func-to-llvm mlir/test/Examples/mlir-opt/ctlz.mlir
+```
+
+Second, passes can be configured to run
+in a way that is limited to a particular sub-IR
+nested under scope-isolated ops like functions.
+For example, one could run the `cse` and `canonicalize` passes
+on each `func.func` separately, as part of a larger pipeline, by running
+
+```bash
+build/bin/mlir-opt mlir/test/Examples/mlir-opt/ctlz.mlir --pass-pipeline='
+    builtin.module(
+        convert-math-to-funcs{convert-ctlz=1},
+        func.func(cse,canonicalize),
+        convert-scf-to-cf,
+        convert-to-llvm
+    )'
 ```
 
-Each pass gets its own command line flag, some are grouped into pipelines, and the `--pass-pipeline` command line flag can be used to provide a (serialized version of) an ordered list of passes to run on the input MLIR.1
+The outer nesting tells `mlir-opt` to run the pass pipeline
+on each `module` op,
+and then within that to run `convert-math-to-funcs`,
+then (on each `func.func` op), the [`cse`](https://mlir.llvm.org/docs/Passes/#-cse)
+and [`canonicalize`](https://mlir.llvm.org/docs/Passes/#-canonicalize) passes,
+then `convert-scf-to-cf`, and finally `convert-to-llvm` to convert the rest to the `llvm` dialect.
 
-We won't cover the internal workings of the `math-to-funcs` pass in this or a future article, but next time we will actually write our own, simpler pass that does something nontrivial. Until then, I'll explain a bit about how testing works in MLIR, using these two ctlz programs as example test cases.
+For a spec of the pass-pipeline textual description language,
+see [the docs](https://mlir.llvm.org/docs/PassManagement/#textual-pass-pipeline-specification).
 
-For those who are interested, the MLIR documentation contains a [complete list of passes](https://mlir.llvm.org/docs/Passes/) owned by the upstream MLIR project, which can be used by invoking the corresponding command line flag or nesting it inside of a larger `--pass-pipeline`.
+## Further reading
 
-## Optimizing `affine.for`
+- [List of passes](https://mlir.llvm.org/docs/Passes/)
+- [List of dialects](https://mlir.llvm.org/docs/Dialects/)
diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
new file mode 100644
index 0000000000000..9c3393c259bf8
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -0,0 +1,4 @@
+func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}
diff --git a/mlir/test/Examples/mlir-opt/loop-fusion.mlir b/mlir/test/Examples/mlir-opt/loop-fusion.mlir
new file mode 100644
index 0000000000000..0f9e37faa6860
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop-fusion.mlir
@@ -0,0 +1,20 @@
+func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+  %0 = memref.alloc() : memref<10xf32>
+  %1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %arg2 = 0 to 10 {
+    affine.store %cst, %0[%arg2] : memref<10xf32>
+    affine.store %cst, %1[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %0[%arg2] : memref<10xf32>
+    %3 = arith.addf %2, %2 : f32
+    affine.store %3, %arg0[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %1[%arg2] : memref<10xf32>
+    %3 = arith.mulf %2, %2 : f32
+    affine.store %3, %arg1[%arg2] : memref<10xf32>
+  }
+  return
+}

>From 46e2396a5ff8dc5c0a4e5644c64727962d9b201e Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Tue, 18 Jun 2024 16:35:27 -0700
Subject: [PATCH 04/19] Fix up test files

---
 mlir/docs/Tutorials/MlirOpt.md                |  6 ++---
 mlir/test/Examples/mlir-opt/ctlz.mlir         |  5 ++++
 mlir/test/Examples/mlir-opt/ctlz_funcs.mlir   | 10 ++++++++
 mlir/test/Examples/mlir-opt/ctlz_llvm.mlir    |  8 ++++++
 .../test/Examples/mlir-opt/ctlz_pipeline.mlir | 15 +++++++++++
 mlir/test/Examples/mlir-opt/loop_fusion.mlir  | 25 +++++++++++++++++++
 ...p-fusion.mlir => loop_fusion_default.mlir} |  4 +++
 .../mlir-opt/loop_fusion_options.mlir         | 24 ++++++++++++++++++
 8 files changed, 94 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
 create mode 100644 mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
 create mode 100644 mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
 create mode 100644 mlir/test/Examples/mlir-opt/loop_fusion.mlir
 rename mlir/test/Examples/mlir-opt/{loop-fusion.mlir => loop_fusion_default.mlir} (84%)
 create mode 100644 mlir/test/Examples/mlir-opt/loop_fusion_options.mlir

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index f8a4f312a87dd..95babc939e427 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -296,7 +296,7 @@ being loop counters and iteration bounds.
 Recall our second program, the poorly-tuned loops.
 
 ```mlir
-// mlir/test/Examples/mlir-opt/loop-fusion.mlir
+// mlir/test/Examples/mlir-opt/loop_fusion.mlir
 func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   %0 = memref.alloc() : memref<10xf32>
   %1 = memref.alloc() : memref<10xf32>
@@ -323,7 +323,7 @@ Running this with the [`affine-loop-fusion`](https://mlir.llvm.org/docs/Passes/#
 produces a fused loop.
 
 ```bash
-build/bin/mlir-opt --affine-loop-fusion mlir/test/Examples/mlir-opt/loop-fusion.mlir
+build/bin/mlir-opt --affine-loop-fusion mlir/test/Examples/mlir-opt/loop_fusion.mlir
 ```
 
 ```mlir
@@ -354,7 +354,7 @@ If this value is set to zero on the command line,
 the pass will not fuse the loops.
 
 ```bash
-build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' mlir/test/Examples/mlir-opt/loop-fusion.mlir
+build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' mlir/test/Examples/mlir-opt/loop_fusion.mlir
 ```
 
 ```mlir
diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
index 9c3393c259bf8..f9b667851792c 100644
--- a/mlir/test/Examples/mlir-opt/ctlz.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -1,3 +1,8 @@
+// This file is left in-tree despite having no assertions so it can be
+// referenced by the tutorial text.
+
+// RUN: mlir-opt %s
+
 func.func @main(%arg0: i32) -> i32 {
   %0 = math.ctlz %arg0 : i32
   func.return %0 : i32
diff --git a/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir b/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
new file mode 100644
index 0000000000000..894f24d56dc8a
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
@@ -0,0 +1,10 @@
+// RUN: mlir-opt --convert-math-to-funcs=convert-ctlz %s | FileCheck %s
+
+// CHECK-LABEL: @main
+// CHECK-NEXT: call @__mlir_math_ctlz_i32
+
+// CHECK-LABEL: func.func private @__mlir_math_ctlz_i32
+func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}
diff --git a/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir b/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
new file mode 100644
index 0000000000000..d8c5b7550be0c
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
@@ -0,0 +1,8 @@
+// RUN: mlir-opt --convert-math-to-llvm %s | FileCheck %s
+
+// CHECK-LABEL: @main
+// CHECK-NEXT: llvm.intr.ctlz
+func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}
diff --git a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
new file mode 100644
index 0000000000000..2b5ce3408e1fd
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
@@ -0,0 +1,15 @@
+// RUN: mlir-opt --pass-pipeline='
+// RUN:    builtin.module(
+// RUN:        convert-math-to-funcs{convert-ctlz=1},
+// RUN:        func.func(cse,canonicalize),
+// RUN:        convert-scf-to-cf,
+// RUN:        convert-to-llvm
+// RUN:    )' %s | FileCheck %s
+
+// CHECK-LABEL: @main
+// CHECK: llvm
+func.func @main(%arg0: i32) -> i32 {
+  %0 = math.ctlz %arg0 : i32
+  func.return %0 : i32
+}
+
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion.mlir b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
new file mode 100644
index 0000000000000..24a44d8a53f31
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
@@ -0,0 +1,25 @@
+// This file is left in-tree despite having no assertions so it can be
+// referenced by the tutorial text.
+
+// RUN: mlir-opt %s
+
+func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+  %0 = memref.alloc() : memref<10xf32>
+  %1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %arg2 = 0 to 10 {
+    affine.store %cst, %0[%arg2] : memref<10xf32>
+    affine.store %cst, %1[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %0[%arg2] : memref<10xf32>
+    %3 = arith.addf %2, %2 : f32
+    affine.store %3, %arg0[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %1[%arg2] : memref<10xf32>
+    %3 = arith.mulf %2, %2 : f32
+    affine.store %3, %arg1[%arg2] : memref<10xf32>
+  }
+  return
+}
diff --git a/mlir/test/Examples/mlir-opt/loop-fusion.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
similarity index 84%
rename from mlir/test/Examples/mlir-opt/loop-fusion.mlir
rename to mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
index 0f9e37faa6860..b22bb2546d49e 100644
--- a/mlir/test/Examples/mlir-opt/loop-fusion.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
@@ -1,3 +1,7 @@
+// RUN: mlir-opt --affine-loop-fusion %s | FileCheck %s
+
+// CHECK-LABEL: @producer_consumer_fusion
+// CHECK-COUNT-1: affine.for
 func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   %0 = memref.alloc() : memref<10xf32>
   %1 = memref.alloc() : memref<10xf32>
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
new file mode 100644
index 0000000000000..47e75ae86fc9b
--- /dev/null
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
@@ -0,0 +1,24 @@
+// RUN: mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' %s | FileCheck %s
+
+// CHECK-LABEL: @producer_consumer_fusion
+// CHECK-COUNT-3: affine.for
+func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+  %0 = memref.alloc() : memref<10xf32>
+  %1 = memref.alloc() : memref<10xf32>
+  %cst = arith.constant 0.000000e+00 : f32
+  affine.for %arg2 = 0 to 10 {
+    affine.store %cst, %0[%arg2] : memref<10xf32>
+    affine.store %cst, %1[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %0[%arg2] : memref<10xf32>
+    %3 = arith.addf %2, %2 : f32
+    affine.store %3, %arg0[%arg2] : memref<10xf32>
+  }
+  affine.for %arg2 = 0 to 10 {
+    %2 = affine.load %1[%arg2] : memref<10xf32>
+    %3 = arith.mulf %2, %2 : f32
+    affine.store %3, %arg1[%arg2] : memref<10xf32>
+  }
+  return
+}

>From c5252cd109151f6947972ab14cad2a293b13b4fa Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Wed, 19 Jun 2024 12:21:12 -0700
Subject: [PATCH 05/19] fix RUN directive for pipeline

---
 mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
index 2b5ce3408e1fd..70b639a09062f 100644
--- a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
@@ -1,10 +1,4 @@
-// RUN: mlir-opt --pass-pipeline='
-// RUN:    builtin.module(
-// RUN:        convert-math-to-funcs{convert-ctlz=1},
-// RUN:        func.func(cse,canonicalize),
-// RUN:        convert-scf-to-cf,
-// RUN:        convert-to-llvm
-// RUN:    )' %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline=' builtin.module( convert-math-to-funcs{convert-ctlz=1}, func.func(cse,canonicalize), convert-scf-to-cf, convert-to-llvm)' %s | FileCheck %s
 
 // CHECK-LABEL: @main
 // CHECK: llvm

>From 7d4d56e86a9fec42953cdb29bb1df6852096a897 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Wed, 19 Jun 2024 13:07:37 -0700
Subject: [PATCH 06/19] fix comments

---
 mlir/docs/Tutorials/MlirOpt.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 95babc939e427..06cb8f8c8016d 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -71,6 +71,7 @@ The first defines a function that counts the leading zeroes of a 32-bit integer
 using the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation.
 
 ```mlir
+// mlir/test/Examples/mlir-opt/ctlz.mlir
 func.func @main(%arg0: i32) -> i32 {
   %0 = math.ctlz %arg0 : i32
   func.return %0 : i32
@@ -110,6 +111,7 @@ The second program is a sequence of loops
 that exhibits poor cache locality.
 
 ```mlir
+// mlir/test/Examples/mlir-opt/loop_fusion.mlir
 func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
   %0 = memref.alloc() : memref<10xf32>
   %1 = memref.alloc() : memref<10xf32>
@@ -175,7 +177,7 @@ to generate LLVM-IR.
 Recall our ctlz program:
 
 ```mlir
-# mlir/test/Examples/mlir-opt/ctlz.mlir
+// mlir/test/Examples/mlir-opt/ctlz.mlir
 func.func @main(%arg0: i32) -> i32 {
   %0 = math.ctlz %arg0 : i32
   func.return %0 : i32

>From 9f4832d8be9a7768c51807df95acef6006012dc9 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Wed, 19 Jun 2024 15:21:42 -0700
Subject: [PATCH 07/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 06cb8f8c8016d..e1efc30e6d7eb 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -20,7 +20,7 @@ see [the MLIR paper](https://arxiv.org/abs/2002.11054).
 Two of the central concepts in MLIR are *dialects* and *lowerings*.
 In traditional compilers, there is typically one "dialect,"
 called an *intermediate representation*, or IR,
-that is the textual or data-structural description of a program
+that is the textual or data-structural representation of a program
 within the scope of the compiler's execution.
 For example, in GCC the IR is called GIMPLE,
 and in LLVM it's called LLVM-IR.

>From c94e76c7061e41af73a2d87f94f15a57493b378c Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Wed, 19 Jun 2024 15:24:25 -0700
Subject: [PATCH 08/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index e1efc30e6d7eb..e1d592443924e 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -66,7 +66,7 @@ and produces non-MLIR program representations as output.
 
 ## Two example programs
 
-Here are two MLIR programs.
+Here are the textual representations of two MLIR programs.
 The first defines a function that counts the leading zeroes of a 32-bit integer (`i32`)
 using the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation.
 

>From 93e28ebc047dfe0b4b906bba444aa2d15ed208b1 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Wed, 19 Jun 2024 15:41:02 -0700
Subject: [PATCH 09/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index e1d592443924e..e71dbe4ada396 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -157,7 +157,7 @@ as well as emitting debug and diagnostic data.
 
 Running `mlir-opt` with no flags will consume MLIR input
 from standard in, parse and run verifiers on it,
-and write the MLIR back to standard out.
+and write the textual format back to standard out.
 This is a good way to test if an input MLIR is well-formed.
 
 `mlir-opt --help` shows a complete list of flags

>From cf530f5eedb515edb14a27eaf9c69872c2464b15 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Wed, 19 Jun 2024 15:41:56 -0700
Subject: [PATCH 10/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index e71dbe4ada396..eaee70ac3c210 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -155,7 +155,7 @@ the `mlir-opt` binary (located in `build/bin`)
 is the entry point for running passes and lowerings,
 as well as emitting debug and diagnostic data.
 
-Running `mlir-opt` with no flags will consume MLIR input
+Running `mlir-opt` with no flags will consume textual or bytecode IR
 from standard in, parse and run verifiers on it,
 and write the textual format back to standard out.
 This is a good way to test if an input MLIR is well-formed.

>From adcece80be9b13eb892b8dbc0f04979eda431a24 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:46:10 -0700
Subject: [PATCH 11/19] simplify presentation to just mlir-opt details

---
 mlir/docs/Tutorials/MlirOpt.md             | 291 +++------------------
 mlir/test/Examples/mlir-opt/ctlz_llvm.mlir |   4 +-
 2 files changed, 43 insertions(+), 252 deletions(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index eaee70ac3c210..7b37040462385 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -1,154 +1,22 @@
 # Using `mlir-opt`
 
-`mlir-opt` is the command-line entry point for running passes and lowerings on MLIR code.
-This tutorial will explain how to use `mlir-opt` to run passes, and explain
-some details about MLIR's built-in dialects along the way.
+`mlir-opt` is a command-line entry point for running passes and lowerings on MLIR code.
+This tutorial will explain how to use `mlir-opt`, show some examples of its usage,
+and mention some useful tips for working with it.
 
 Prerequisites:
 
 - [Building MLIR from source](/getting_started/)
+- [MLIR Language Reference](/docs/LangRef/)
 
 [TOC]
 
-## Overview
-
-We start with a brief summary of context that helps to frame
-the uses of `mlir-opt` detailed in this article.
-For a deeper dive on motivation and design,
-see [the MLIR paper](https://arxiv.org/abs/2002.11054).
-
-Two of the central concepts in MLIR are *dialects* and *lowerings*.
-In traditional compilers, there is typically one "dialect,"
-called an *intermediate representation*, or IR,
-that is the textual or data-structural representation of a program
-within the scope of the compiler's execution.
-For example, in GCC the IR is called GIMPLE,
-and in LLVM it's called LLVM-IR.
-Compilers typically convert an input program to their IR,
-run optimization passes,
-and then convert the optimized IR to machine code.
-
-MLIR's philosophy is to split the job into smaller steps.
-First, MLIR allows one to define many IRs called *dialects*,
-some considered "high level" and some "low level,"
-but each with a set of types, operations, metadata,
-and semantics that defines what the operations do.
-Different dialects may coexist in the same program.
-Then, one writes a set of *lowering passes*
-that incrementally converts different parts of the program
-from higher level dialects to lower and lower dialects
-until you get to machine code
-(or, in many cases, LLVM, which finishes the job).
-Along the way,
-*optimizing passes* are run to make the code more efficient.
-The main point here is that the high level dialects exist
-*so that* they make it easy to write these important optimizing passes.
-
-A central motivation for building MLIR
-was to build the `affine` dialect,
-which is designed to enable [polyhedral optimizations](https://polyhedral.info/)
-for loop transformations.
-Compiler engineers had previously implemented polyhedral optimizations
-in LLVM and GCC (without an `affine` dialect),
-and it was difficult because they had to reconstruct well-structured loop nests
-from a much more complicated set of low-level operations.
-Having a higher level `affine` dialect preserves the loop nest structure
-at an abstraction layer that makes it easier to write optimizations,
-and then discards it during lowering passes.
-
-The `mlir-opt` tool can run both
-optimization passes and lowerings,
-though the final code generation
-is performed by a different tool called `mlir-translate`.
-In particular, `mlir-opt` consumes MLIR as input and produce MLIR as output,
-while `mlir-translate` consumes MLIR as input
-and produces non-MLIR program representations as output.
-
-## Two example programs
-
-Here are the textual representation of two MLIR programs.
-The first defines a function that counts the leading zeroes of a 32-bit integer (`i32`)
-using the [`math` dialect's](/docs/Dialects/MathOps/) `ctlz` operation.
+## `mlir-opt` basics
 
-```mlir
-// mlir/test/Examples/mlir-opt/ctlz.mlir
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
-}
-```
-
-This shows the basic structure of an MLIR operation
-([see here](https://mlir.llvm.org/docs/LangRef/#operations) for a more complete spec).
-Variable names are prefixed with `%`,
-functions by `@`,
-and each variable/value in a program has a type,
-often expressed after a colon.
-In this case all the types are `i32`,
-except for the function type which is `(i32) -> i32`
-(not specified explicitly above, but you'll see it in the `func.call` later).
-
-Each statement is anchored around an expression like `math.ctlz`
-which specifies the dialect [`math`](https://mlir.llvm.org/docs/Dialects/MathOps/) via a namespace,
-and the operation [`ctlz`](https://mlir.llvm.org/docs/Dialects/MathOps/#mathctlz-mathcountleadingzerosop) after the `.`.
-The rest of the syntax of the operation
-is determined by a parser defined by the dialect,
-and so many operations will have different syntaxes.
-In the case of `math.ctlz`,
-the sole argument is an integer whose leading zeros are to be counted,
-and the trailing ` : i32` denotes the output type storing the count.
-
-It's important to note that [`func`](https://mlir.llvm.org/docs/Dialects/Func/) is itself a dialect,
-and [`func.func`](https://mlir.llvm.org/docs/Dialects/Func/#funcfunc-funcfuncop) is an operation,
-where the braces and the function's body is part of the syntax.
-In MLIR a list of operations within braces is called a [*region*](https://mlir.llvm.org/docs/LangRef/#regions),
-and an operation can have zero regions like `math.ctlz`,
-one region like `func.func`,
-or multiple regions like [`scf.if`](https://mlir.llvm.org/docs/Dialects/SCFDialect/#scfif-scfifop),
-which has a region for each of its two control flow branches.
-
-The second program is a sequence of loops
-that exhibits poor cache locality.
-
-```mlir
-// mlir/test/Examples/mlir-opt/loop_fusion.mlir
-func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-  %0 = memref.alloc() : memref<10xf32>
-  %1 = memref.alloc() : memref<10xf32>
-  %cst = arith.constant 0.000000e+00 : f32
-  affine.for %arg2 = 0 to 10 {
-    affine.store %cst, %0[%arg2] : memref<10xf32>
-    affine.store %cst, %1[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %0[%arg2] : memref<10xf32>
-    %3 = arith.addf %2, %2 : f32
-    affine.store %3, %arg0[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %1[%arg2] : memref<10xf32>
-    %3 = arith.mulf %2, %2 : f32
-    affine.store %3, %arg1[%arg2] : memref<10xf32>
-  }
-  return
-}
-```
-
-This program introduces some additional dialects.
-The [`affine` dialect](https://mlir.llvm.org/docs/Dialects/Affine/) mentioned in the introduction
-represents well-structured loop nests,
-and the [`affine.for` operation](https://mlir.llvm.org/docs/Dialects/Affine/#affinefor-affineaffineforop)
-whose region corresponds to the loop's body.
-`affine.for` also showcases some custom-defined syntax
-to represent the loop bounds and loop induction variable.
-The [`memref` dialect](https://mlir.llvm.org/docs/Dialects/MemRef/)
-defines types and operations related to memory management
-with pointer semantics.
-Note also that while `memref` has store and load operations,
-`affine` has its own that limit what types of memory accesses are allowed,
-so as to ensure the well-structuredness of the loop nest.
-
-## Running `mlir-opt`
+The `mlir-opt` tool loads a textual IR or bytecode into an in-memory structure,
+and optionally executes a sequence of passes
+before serializing back the IR (textual form by default).
+It is intended as a testing and debugging utility.
 
 After building the MLIR project,
 the `mlir-opt` binary (located in `build/bin`)
@@ -164,17 +32,11 @@ This is a good way to test if an input MLIR is well-formed.
 (there are nearly 1000).
 Each pass gets its own flag.
 
-## Lowering `ctlz`
-
-Next we will show two of MLIR's lowering passes.
-The first, `convert-math-to-llvm`, converts the `ctlz` op
-to the `llvm` dialect's [`intr.ctlz` op](https://mlir.llvm.org/docs/Dialects/LLVM/#llvmintrctlz-llvmcountleadingzerosop)
-which is an LLVM intrinsic.
-Note that `llvm` here is MLIR's `llvm` dialect,
-which would still need to be processed through `mlir-translate`
-to generate LLVM-IR.
+## Running a pass
 
-Recall our ctlz program:
+Next we run [`--convert-to-llvm`](/docs/Passes/#-convert-to-llvm),
+which converts all supported dialects to the `llvm` dialect,
+on the following IR:
 
 ```mlir
 // mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -201,101 +63,14 @@ module {
 }
 ```
 
-As you can see, the `math` dialect was the only thing that changed
-due to the lowering.
-The `func` dialect was left alone,
-even though it also needs to be converted to the `llvm` dialect
+Note that `llvm` here is MLIR's `llvm` dialect,
+which would still need to be processed through `mlir-translate`
 to generate LLVM-IR.
 
-What if the target machine does not have a `ctlz` intrinsic?
-In this case, one can run the `--convert-math-to-funcs`
-pass, which replaces the op with an implementation using
-other MLIR dialects.
+## Running a pass with options
 
-```bash
-build/bin/mlir-opt --convert-math-to-funcs=convert-ctlz mlir/test/Examples/mlir-opt/ctlz.mlir
-```
-
-You will see something similar to:
-
-```mlir
-module {
-  func.func @main(%arg0: i32) -> i32 {
-    %0 = call @__mlir_math_ctlz_i32(%arg0) : (i32) -> i32
-    return %0 : i32
-  }
-  func.func private @__mlir_math_ctlz_i32(%arg0: i32) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
-    %c32_i32 = arith.constant 32 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %0 = arith.cmpi eq, %arg0, %c0_i32 : i32
-    %1 = scf.if %0 -> (i32) {
-      scf.yield %c32_i32 : i32
-    } else {
-      %c1 = arith.constant 1 : index
-      %c1_i32 = arith.constant 1 : i32
-      %c32 = arith.constant 32 : index
-      %c0_i32_0 = arith.constant 0 : i32
-      %2:2 = scf.for %arg1 = %c1 to %c32 step %c1 iter_args(%arg2 = %arg0, %arg3 = %c0_i32_0) -> (i32, i32) {
-        %3 = arith.cmpi slt, %arg2, %c0_i32 : i32
-        %4:2 = scf.if %3 -> (i32, i32) {
-          scf.yield %arg2, %arg3 : i32, i32
-        } else {
-          %5 = arith.addi %arg3, %c1_i32 : i32
-          %6 = arith.shli %arg2, %c1_i32 : i32
-          scf.yield %6, %5 : i32, i32
-        }
-        scf.yield %4#0, %4#1 : i32, i32
-      }
-      scf.yield %2#1 : i32
-    }
-    return %1 : i32
-  }
-}
-```
-
-The algorithm above is not relevant to this tutorial,
-but either way it is quite simple:
-count the leading zeros by shifting the input left one bit at a time
-until it becomes negative (as a signed integer),
-because that occurs exactly when its leading bit is a 1.
-Then add a special case to handle zero,
-which would loop infinitely otherwise.
-
-Here you can see two more MLIR dialects.
-[`arith`](https://mlir.llvm.org/docs/Dialects/ArithOps/)
-is for low-level arithmetic
-and boolean conditions on integers and floats.
-You can define constants,
-compare integers with `arith.cmpi`,
-and do things like add and bit shift (`arith.shli` is a left shift).
-[`scf`](https://mlir.llvm.org/docs/Dialects/SCFDialect/),
-short for "structured control flow,"
-defines for loops, while loops,
-and control flow branching using regions.
-`scf.yield` defines the "output" value
-from each region of an if/else operation or loop body
-which is necessary here because an `if` operation has a result value.
-The "structured" in `scf` is in contrast to
-[`cf`](https://mlir.llvm.org/docs/Dialects/ControlFlowDialect/),
-for "(unstructured) control flow,"
-which does low-level, region-free control flow
-that jumps between SSA blocks on a control flow graph.
-
-Two other minor aspects of the syntax are on display.
-First is the syntax `%4:2`,
-which defines a variable `%4` as a tuple of two values.
-The corresponding `%4#1` accesses the second entry in the tuple.
-Second, there's a type called `index` that is different from `i32`.
-Though they both represent integers,
-[`index`](https://mlir.llvm.org/docs/Rationale/Rationale/#integer-signedness-semantics) is a platform-dependent integer type
-suitable for indexing arrays,
-representing sizes and dimensions of things,
-and, in the above program,
-being loop counters and iteration bounds.
-
-## Optimizing loop nests
-
-Recall our second program, the poorly-tuned loops.
+Next we will show how to run a pass that takes configuration options.
+Consider the following IR containing loops with poor cache locality.
 
 ```mlir
 // mlir/test/Examples/mlir-opt/loop_fusion.mlir
@@ -356,7 +131,8 @@ If this value is set to zero on the command line,
 the pass will not fuse the loops.
 
 ```bash
-build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' mlir/test/Examples/mlir-opt/loop_fusion.mlir
+build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' \
+mlir/test/Examples/mlir-opt/loop_fusion.mlir
 ```
 
 ```mlir
@@ -387,7 +163,7 @@ module {
 Options passed to a pass
 should come in the form of a quoted string
 (to join all options into a single shell argument)
-with space-separated `key=value` pairs.
+with space-separated `key=value` pairs for each option.
 
 ## Building a pass pipeline on the command line
 
@@ -397,7 +173,7 @@ First, by simply placing the pass flags one after the other,
 they will be run in order.
 
 ```bash
-build/bin/mlir-opt --convert-math-to-llvm --convert-func-to-llvm mlir/test/Examples/mlir-opt/ctlz.mlir
+build/bin/mlir-opt --convert-to-llvm --canonicalize mlir/test/Examples/mlir-opt/ctlz.mlir
 ```
 
 Passes can also be configured to run
@@ -409,23 +185,38 @@ on each `func` separately, by running
 ```bash
 build/bin/mlir-opt mlir/test/Examples/mlir-opt/ctlz.mlir --pass-pipeline='
     builtin.module(
-        convert-math-to-funcs{convert-ctlz=1},
         func.func(cse,canonicalize),
-        convert-scf-to-cf,
         convert-to-llvm
     )'
 ```
 
 The outer nesting tells `mlir-opt` to run the pass pipeline
 on each `module` op,
-and then within that to run `convert-math-to-funcs`,
-then (on each `func.func` op), the [`cse`](https://mlir.llvm.org/docs/Passes/#-cse)
+and then within that to run (on each `func.func` op),
+the [`cse`](https://mlir.llvm.org/docs/Passes/#-cse)
 and [`canonicalize`](https://mlir.llvm.org/docs/Passes/#-canonicalize) passes,
 and then convert the rest to the `llvm` dialect.
 
 For a spec of the pass-pipeline textual description language,
 see [the docs](https://mlir.llvm.org/docs/PassManagement/#textual-pass-pipeline-specification).
 
+## Useful CLI flags
+
+- `--debug` prints all debug information produced by `LLVM_DEBUG` calls.
+- `--debug-only="my-tag"` prints only the debug information produced by `LLVM_DEBUG`
+  in files that have the macro `#define DEBUG_TYPE "my-tag"`.
+  This often allows you to print only debug information associated with a specific pass.
+    - `"greedy-rewriter"` only prints debug information
+      for patterns applied with the greedy rewriter engine.
+    - `"dialect-conversion"` only prints debug information
+      for the dialect conversion framework.
+ - `--emit-bytecode` emits MLIR in the bytecode format.
+ - `--mlir-pass-statistics` prints statistics about the passes run.
+    These are generated via [pass statistics](https://mlir.llvm.org/docs/PassManagement/#pass-statistics).
+ - `--mlir-print-ir-after-all` prints the IR after each pass.
+    See also `--mlir-print-ir-after-change` and `--mlir-print-ir-after-failure`.
+ - `--mlir-timing` displays execution times of each pass.
+
 ## Further reading
 
 - [List of passes](https://mlir.llvm.org/docs/Passes/)
diff --git a/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir b/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
index d8c5b7550be0c..01d6c2748f59a 100644
--- a/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
@@ -1,7 +1,7 @@
-// RUN: mlir-opt --convert-math-to-llvm %s | FileCheck %s
+// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
 
 // CHECK-LABEL: @main
-// CHECK-NEXT: llvm.intr.ctlz
+// CHECK: llvm.intr.ctlz
 func.func @main(%arg0: i32) -> i32 {
   %0 = math.ctlz %arg0 : i32
   func.return %0 : i32

>From 5c91b9bde408cdfc3c1caa95fad859aecf5c05e7 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:49:52 -0700
Subject: [PATCH 12/19] simplify tests

---
 mlir/test/Examples/mlir-opt/ctlz.mlir          |  7 +++----
 mlir/test/Examples/mlir-opt/ctlz_funcs.mlir    | 10 ----------
 mlir/test/Examples/mlir-opt/ctlz_llvm.mlir     |  8 --------
 mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir |  2 +-
 4 files changed, 4 insertions(+), 23 deletions(-)
 delete mode 100644 mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
 delete mode 100644 mlir/test/Examples/mlir-opt/ctlz_llvm.mlir

diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
index f9b667851792c..01d6c2748f59a 100644
--- a/mlir/test/Examples/mlir-opt/ctlz.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -1,8 +1,7 @@
-// This file is left in-tree despite having no assertions so it can be
-// referenced by the tutorial text.
-
-// RUN: mlir-opt %s
+// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
 
+// CHECK-LABEL: @main
+// CHECK: llvm.intr.ctlz
 func.func @main(%arg0: i32) -> i32 {
   %0 = math.ctlz %arg0 : i32
   func.return %0 : i32
diff --git a/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir b/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
deleted file mode 100644
index 894f24d56dc8a..0000000000000
--- a/mlir/test/Examples/mlir-opt/ctlz_funcs.mlir
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: mlir-opt --convert-math-to-funcs=convert-ctlz %s | FileCheck %s
-
-// CHECK-LABEL: @main
-// CHECK-NEXT: call @__mlir_math_ctlz_i32
-
-// CHECK-LABEL: func.func private @__mlir_math_ctlz_i32
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
-}
diff --git a/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir b/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
deleted file mode 100644
index 01d6c2748f59a..0000000000000
--- a/mlir/test/Examples/mlir-opt/ctlz_llvm.mlir
+++ /dev/null
@@ -1,8 +0,0 @@
-// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
-
-// CHECK-LABEL: @main
-// CHECK: llvm.intr.ctlz
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
-}
diff --git a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
index 70b639a09062f..0d0966f8bd320 100644
--- a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --pass-pipeline=' builtin.module( convert-math-to-funcs{convert-ctlz=1}, func.func(cse,canonicalize), convert-scf-to-cf, convert-to-llvm)' %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline=' builtin.module(convert-math-to-funcs{convert-ctlz=1}, func.func(cse,canonicalize), convert-scf-to-cf, convert-to-llvm)' %s | FileCheck %s
 
 // CHECK-LABEL: @main
 // CHECK: llvm

>From fd4ed77aa43e600c287220fc7d4b96fb872d2d25 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Wed, 24 Jul 2024 17:12:45 -0700
Subject: [PATCH 13/19] improve pass pipeline section

---
 mlir/docs/Tutorials/MlirOpt.md                | 78 +++++++++++++++----
 .../test/Examples/mlir-opt/ctlz_pipeline.mlir | 33 ++++++--
 2 files changed, 92 insertions(+), 19 deletions(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 7b37040462385..26bb76e92504e 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -176,26 +176,78 @@ they will be run in order.
 build/bin/mlir-opt --convert-to-llvm --canonicalize mlir/test/Examples/mlir-opt/ctlz.mlir
 ```
 
-Passes can also be configured to run
-in a way that is limited to a particular sub-IR
-nested under scope-isolated ops like functions.
-For example, one could run `--convert-math-to-llvm`
-on each `func` separately, by running
+This simplified form is useful, but only works for
+passes which "anchor" on `builtin.module`.
+[Pass anchoring](https://mlir.llvm.org/docs/PassManagement/#oppassmanager)
+is a way for passes to specify
+that they only run on particular ops
+or at a particular level of IR nesting.
+If you use the short form with a pass that is not anchored properly,
+it will not run.
+
+To use passes that have non-trivial anchoring,
+and to be more precise about where and how passes should run,
+one can use the `pass-pipeline` flag.
+
+For example, consider the following IR which has the same redundant code,
+but in two different levels of nesting.
+
+```mlir
+module {
+  module {
+    func.func @func1(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      func.return %2 : i32
+    }
+  }
+
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
+}
+```
+
+The following pipeline runs `cse` (common subexpression elimination)
+and `canonicalize`, but only on the `func.func` nested inside the two
+`builtin.module` ops, and then runs `convert-to-llvm` on the inner module.
 
 ```bash
 build/bin/mlir-opt mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir --pass-pipeline='
     builtin.module(
-        func.func(cse,canonicalize),
-        convert-to-llvm
+        builtin.module(
+            func.func(cse,canonicalize),
+            convert-to-llvm
+        )
     )'
 ```
 
-The outer nesting tells `mlir-opt` to run the pass pipeline
-on each `module` op,
-and then within that to run (on each `func.func` op),
-the [`cse`](https://mlir.llvm.org/docs/Passes/#-cse)
-and [`canonicalize`](https://mlir.llvm.org/docs/Passes/#-canonicalize) passes,
-and then convert the rest to the `llvm` dialect.
+The output leaves the `gpu.module` alone:
+
+```mlir
+module {
+  module {
+    llvm.func @func1(%arg0: i32) -> i32 {
+      %0 = llvm.add %arg0, %arg0 : i32
+      %1 = llvm.add %0, %0 : i32
+      llvm.return %1 : i32
+    }
+  }
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
+}
+```
 
 For a spec of the pass-pipeline textual description language,
 see [the docs](https://mlir.llvm.org/docs/PassManagement/#textual-pass-pipeline-specification).
diff --git a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
index 0d0966f8bd320..5ba1f1f869688 100644
--- a/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz_pipeline.mlir
@@ -1,9 +1,30 @@
-// RUN: mlir-opt --pass-pipeline=' builtin.module(convert-math-to-funcs{convert-ctlz=1}, func.func(cse,canonicalize), convert-scf-to-cf, convert-to-llvm)' %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline='builtin.module(builtin.module(func.func(cse,canonicalize),convert-to-llvm))' %s | FileCheck %s
 
-// CHECK-LABEL: @main
-// CHECK: llvm
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
+// CHECK-LABEL: llvm.func @func1
+// CHECK-NEXT: llvm.add
+// CHECK-NEXT: llvm.add
+// CHECK-NEXT: llvm.return
+module {
+  module {
+    func.func @func1(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      func.return %2 : i32
+    }
+  }
+
+  // CHECK-LABEL: @gpu_module
+  // CHECK-LABEL: gpu.func @func2
+  // CHECK-COUNT-3: arith.addi
+  // CHECK-NEXT: gpu.return
+  gpu.module @gpu_module {
+    gpu.func @func2(%arg0: i32) -> i32 {
+      %0 = arith.addi %arg0, %arg0 : i32
+      %1 = arith.addi %arg0, %arg0 : i32
+      %2 = arith.addi %0, %1 : i32
+      gpu.return %2 : i32
+    }
+  }
 }
 

>From b8ea3b96147a2f62a7af114860e0b8e0e3928f7d Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:09:46 -0700
Subject: [PATCH 14/19] only show pass-pipeline

---
 mlir/docs/Tutorials/MlirOpt.md                | 63 ++++++++++---------
 mlir/test/Examples/mlir-opt/ctlz.mlir         |  2 +-
 .../mlir-opt/loop_fusion_default.mlir         |  3 +-
 .../mlir-opt/loop_fusion_options.mlir         |  2 +-
 4 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 26bb76e92504e..5b63b69ff187d 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -30,11 +30,13 @@ This is a good way to test if an input MLIR is well-formed.
 
 `mlir-opt --help` shows a complete list of flags
 (there are nearly 1000).
-Each pass gets its own flag.
+Each pass has its own flag,
+though it is recommended to use `--pass-pipeline`
+to run passes rather than bare flags.
 
 ## Running a pass
 
-Next we run [`--convert-to-llvm`](/docs/Passes/#-convert-to-llvm),
+Next we run [`convert-to-llvm`](/docs/Passes/#-convert-to-llvm),
 which converts all supported dialects to the `llvm` dialect,
 on the following IR:
 
@@ -49,7 +51,7 @@ func.func @main(%arg0: i32) -> i32 {
 After building MLIR, and from the `llvm-project` base directory, run
 
 ```bash
-build/bin/mlir-opt --convert-math-to-llvm mlir/test/Examples/mlir-opt/ctlz.mlir
+build/bin/mlir-opt --pass-pipeline="builtin.module(convert-math-to-llvm)" mlir/test/Examples/mlir-opt/ctlz.mlir
 ```
 
 which produces
@@ -96,11 +98,11 @@ func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>
 }
 ```
 
-Running this with the [`affine-loop-fusion`](https://mlir.llvm.org/docs/Passes/#-affine-loop-fusion) pass
+Running this with the [`affine-loop-fusion`](/docs/Passes/#-affine-loop-fusion) pass
 produces a fused loop.
 
 ```bash
-build/bin/mlir-opt --affine-loop-fusion mlir/test/Examples/mlir-opt/loop_fusion.mlir
+build/bin/mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion)" mlir/test/Examples/mlir-opt/loop_fusion.mlir
 ```
 
 ```mlir
@@ -131,7 +133,7 @@ If this value is set to zero on the command line,
 the pass will not fuse the loops.
 
 ```bash
-build/bin/mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' \
+build/bin/mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion{fusion-compute-tolerance=0})" \
 mlir/test/Examples/mlir-opt/loop_fusion.mlir
 ```
 
@@ -161,34 +163,32 @@ module {
 ```
 
 Options passed to a pass
-should come in the form of a quoted string
-(to join all options into a single shell argument)
-with space-separated `key=value` pairs for each option.
+are specified via the syntax `{option1=value1 option2=value2 ...}`,
+i.e., use space-separated `key=value` pairs for each option.
 
 ## Building a pass pipeline on the command line
 
-One can combine passes on the command line in two ways.
-
-First, by simply placing the pass flags one after the other,
-they will be run in order.
-
-```bash
-build/bin/mlir-opt --convert-to-llvm --canonicalize mlir/test/Examples/mlir-opt/ctlz.mlir
-```
-
-This simplified form is useful, but only works for
-passes which "anchor" on `builtin.module`.
-[Pass anchoring](https://mlir.llvm.org/docs/PassManagement/#oppassmanager)
+The `--pass-pipeline` flag supports combining multiple passes into a pipeline.
+So far we have used the trivial pipeline with a single pass
+that is "anchored" on the `builtin.module` op.
+[Pass anchoring](/docs/PassManagement/#oppassmanager)
 is a way for passes to specify
-that they only run on particular ops
-or at a particular level of IR nesting.
-If you use the short form with a pass that is not anchored properly,
+that they only run on particular ops.
+While many passes are anchored on `builtin.module`,
+if you try to run a pass that is anchored on some other op
+inside `--pass-pipeline="builtin.module(pass-name)"`,
 it will not run.
 
-To use passes that have non-trivial anchoring,
-and to be more precise about where and how passes should run,
-one can use the `pass-pipeline` flag.
+Multiple passes can be chained together
+by providing the pass names in a comma-separated list
+in the `--pass-pipeline` string,
+e.g.,
+`--pass-pipeline="builtin.module(pass1,pass2)"`.
+The passes will be run sequentially.
 
+To use passes that have nontrivial anchoring,
+the appropriate level of nesting must be specified
+in the pass pipeline.
 For example, consider the following IR which has the same redundant code,
 but in two different levels of nesting.
 
@@ -250,7 +250,8 @@ module {
 ```
 
 For a spec of the pass-pipeline textual description language,
-see [the docs](https://mlir.llvm.org/docs/PassManagement/#textual-pass-pipeline-specification).
+see [the docs](/docs/PassManagement/#textual-pass-pipeline-specification).
+For more general information on pass management, see [Pass Infrastructure](/docs/PassManagement/#).
 
 ## Useful CLI flags
 
@@ -264,12 +265,12 @@ see [the docs](https://mlir.llvm.org/docs/PassManagement/#textual-pass-pipeline-
       for the dialect conversion framework.
  - `--emit-bytecode` emits MLIR in the bytecode format.
  - `--mlir-pass-statistics` print statistics about the passes run.
-    These are generated via [pass statistics](https://mlir.llvm.org/docs/PassManagement/#pass-statistics).
+    These are generated via [pass statistics](/docs/PassManagement/#pass-statistics).
  - `--mlir-print-ir-after-all` prints the IR after each pass.
     See also `--mlir-print-ir-after-change` and `--mlir-print-ir-after-failure`
  - `--mlir-timing` displays execution times of each pass.
 
 ## Further reading
 
-- [List of passes](https://mlir.llvm.org/docs/Passes/)
-- [List of dialects](https://mlir.llvm.org/docs/Dialects/)
+- [List of passes](/docs/Passes/)
+- [List of dialects](/docs/Dialects/)
diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
index 01d6c2748f59a..da1a8af9cd42e 100644
--- a/mlir/test/Examples/mlir-opt/ctlz.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --convert-to-llvm %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline="builtin.module(convert-to-llvm)" %s | FileCheck %s
 
 // CHECK-LABEL: @main
 // CHECK: llvm.intr.ctlz
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
index b22bb2546d49e..7c2cc99991f83 100644
--- a/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt --affine-loop-fusion %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion)" %s | FileCheck %s
+
 
 // CHECK-LABEL: @producer_consumer_fusion
 // CHECK-COUNT-1: affine.for
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
index 47e75ae86fc9b..97a40f0ee843a 100644
--- a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --affine-loop-fusion='fusion-compute-tolerance=0' %s | FileCheck %s
+// RUN: mlir-opt --pass-pipeline="builtin.module(affine-loop-fusion{fusion-compute-tolerance=0})" %s | FileCheck %s
 
 // CHECK-LABEL: @producer_consumer_fusion
 // CHECK-COUNT-3: affine.for

>From 643826ae5d3756a323cc3ec5fb07fa8b477600fc Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Fri, 26 Jul 2024 16:59:51 -0700
Subject: [PATCH 15/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 5b63b69ff187d..eceaf5beafb76 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -24,7 +24,7 @@ is the entry point for running passes and lowerings,
 as well as emitting debug and diagnostic data.
 
 Running `mlir-opt` with no flags will consume textual or bytecode IR
-from standard in, parse and run verifiers on it,
+from the standard input, parse and run verifiers on it,
 and write the textual format back to standard out.
 This is a good way to test if an input MLIR is well-formed.
 

>From 99d2396867cb421e5a505fc20c054ed9565d3a71 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Fri, 26 Jul 2024 16:59:58 -0700
Subject: [PATCH 16/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index eceaf5beafb76..715e79ebd6d72 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -25,7 +25,7 @@ as well as emitting debug and diagnostic data.
 
 Running `mlir-opt` with no flags will consume textual or bytecode IR
 from the standard input, parse and run verifiers on it,
-and write the textual format back to standard out.
+and write the textual format back to the standard output.
 This is a good way to test if an input MLIR is well-formed.
 
 `mlir-opt --help` shows a complete list of flags

>From de367fa521e8a1b54e1ad6c5c3d63e290472925d Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Fri, 26 Jul 2024 17:00:21 -0700
Subject: [PATCH 17/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 715e79ebd6d72..94745234f4345 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -170,7 +170,7 @@ i.e., use space-separated `key=value` pairs for each option.
 
 The `--pass-pipeline` flag supports combining multiple passes into a pipeline.
 So far we have used the trivial pipeline with a single pass
-that is "anchored" on the `builtin.module` op.
+that is "anchored" on the top-level `builtin.module` op.
 [Pass anchoring](/docs/PassManagement/#oppassmanager)
 is a way for passes to specify
 that they only run on particular ops.

>From 62c492cbe0f79d0010bf1afa88ebc3ff4aa102bc Mon Sep 17 00:00:00 2001
From: Jeremy Kun <kun.jeremy at gmail.com>
Date: Fri, 26 Jul 2024 17:00:33 -0700
Subject: [PATCH 18/19] Update mlir/docs/Tutorials/MlirOpt.md

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/docs/Tutorials/MlirOpt.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 94745234f4345..2b514e46d3bbe 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -266,7 +266,8 @@ For more general information on pass management, see [Pass Infrastructure](/docs
  - `--emit-bytecode` emits MLIR in the bytecode format.
  - `--mlir-pass-statistics` print statistics about the passes run.
     These are generated via [pass statistics](/docs/PassManagement/#pass-statistics).
- - `--mlir-print-ir-after-all` prints the IR after each pass.
+ - `--mlir-print-ir-after-all` prints the IR after each pass,
+     and `--mlir-print-ir-tree-dir` dumps it to disk.
     See also `--mlir-print-ir-after-change` and `--mlir-print-ir-after-failure`
  - `--mlir-timing` displays execution times of each pass.
 

>From 063783a3a9aae13e17012b811e5247da9b203b00 Mon Sep 17 00:00:00 2001
From: Jeremy Kun <j2kun at users.noreply.github.com>
Date: Fri, 26 Jul 2024 17:10:24 -0700
Subject: [PATCH 19/19] use explicit module, explain anchoring for performance

---
 mlir/docs/Tutorials/MlirOpt.md                | 56 ++++++++++++-------
 mlir/test/Examples/mlir-opt/ctlz.mlir         |  8 ++-
 mlir/test/Examples/mlir-opt/loop_fusion.mlir  | 38 +++++++------
 .../mlir-opt/loop_fusion_default.mlir         | 38 +++++++------
 .../mlir-opt/loop_fusion_options.mlir         | 38 +++++++------
 5 files changed, 100 insertions(+), 78 deletions(-)

diff --git a/mlir/docs/Tutorials/MlirOpt.md b/mlir/docs/Tutorials/MlirOpt.md
index 2b514e46d3bbe..61e422cc1229c 100644
--- a/mlir/docs/Tutorials/MlirOpt.md
+++ b/mlir/docs/Tutorials/MlirOpt.md
@@ -42,9 +42,11 @@ on the following IR:
 
 ```mlir
 // mlir/test/Examples/mlir-opt/ctlz.mlir
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = math.ctlz %arg0 : i32
+    func.return %0 : i32
+  }
 }
 ```
 
@@ -76,25 +78,27 @@ Consider the following IR containing loops with poor cache locality.
 
 ```mlir
 // mlir/test/Examples/mlir-opt/loop_fusion.mlir
-func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-  %0 = memref.alloc() : memref<10xf32>
-  %1 = memref.alloc() : memref<10xf32>
-  %cst = arith.constant 0.000000e+00 : f32
-  affine.for %arg2 = 0 to 10 {
-    affine.store %cst, %0[%arg2] : memref<10xf32>
-    affine.store %cst, %1[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %0[%arg2] : memref<10xf32>
-    %3 = arith.addf %2, %2 : f32
-    affine.store %3, %arg0[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %1[%arg2] : memref<10xf32>
-    %3 = arith.mulf %2, %2 : f32
-    affine.store %3, %arg1[%arg2] : memref<10xf32>
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
   }
-  return
 }
 ```
 
@@ -249,6 +253,16 @@ module {
 }
 ```
 
+Specifying a pass pipeline with nested anchoring
+is also beneficial for performance reasons:
+passes with anchoring can run on IR subsets in parallel,
+which provides better threaded runtime and cache locality
+within threads.
+For example,
+even if a pass is not restricted to anchor on `func.func`,
+running `builtin.module(func.func(cse, canonicalize))`
+is more efficient than `builtin.module(cse, canonicalize)`.
+
 For a spec of the pass-pipeline textual description language,
 see [the docs](/docs/PassManagement/#textual-pass-pipeline-specification).
 For more general information on pass management, see [Pass Infrastructure](/docs/PassManagement/#).
diff --git a/mlir/test/Examples/mlir-opt/ctlz.mlir b/mlir/test/Examples/mlir-opt/ctlz.mlir
index da1a8af9cd42e..6e35010ca1c0b 100644
--- a/mlir/test/Examples/mlir-opt/ctlz.mlir
+++ b/mlir/test/Examples/mlir-opt/ctlz.mlir
@@ -2,7 +2,9 @@
 
 // CHECK-LABEL: @main
 // CHECK: llvm.intr.ctlz
-func.func @main(%arg0: i32) -> i32 {
-  %0 = math.ctlz %arg0 : i32
-  func.return %0 : i32
+module {
+  func.func @main(%arg0: i32) -> i32 {
+    %0 = math.ctlz %arg0 : i32
+    func.return %0 : i32
+  }
 }
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion.mlir b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
index 24a44d8a53f31..eec0d3fa57093 100644
--- a/mlir/test/Examples/mlir-opt/loop_fusion.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion.mlir
@@ -3,23 +3,25 @@
 
 // RUN: mlir-opt %s
 
-func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-  %0 = memref.alloc() : memref<10xf32>
-  %1 = memref.alloc() : memref<10xf32>
-  %cst = arith.constant 0.000000e+00 : f32
-  affine.for %arg2 = 0 to 10 {
-    affine.store %cst, %0[%arg2] : memref<10xf32>
-    affine.store %cst, %1[%arg2] : memref<10xf32>
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
   }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %0[%arg2] : memref<10xf32>
-    %3 = arith.addf %2, %2 : f32
-    affine.store %3, %arg0[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %1[%arg2] : memref<10xf32>
-    %3 = arith.mulf %2, %2 : f32
-    affine.store %3, %arg1[%arg2] : memref<10xf32>
-  }
-  return
 }
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
index 7c2cc99991f83..e5c86b84e43e5 100644
--- a/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_default.mlir
@@ -3,23 +3,25 @@
 
 // CHECK-LABEL: @producer_consumer_fusion
 // CHECK-COUNT-1: affine.for
-func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-  %0 = memref.alloc() : memref<10xf32>
-  %1 = memref.alloc() : memref<10xf32>
-  %cst = arith.constant 0.000000e+00 : f32
-  affine.for %arg2 = 0 to 10 {
-    affine.store %cst, %0[%arg2] : memref<10xf32>
-    affine.store %cst, %1[%arg2] : memref<10xf32>
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
   }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %0[%arg2] : memref<10xf32>
-    %3 = arith.addf %2, %2 : f32
-    affine.store %3, %arg0[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %1[%arg2] : memref<10xf32>
-    %3 = arith.mulf %2, %2 : f32
-    affine.store %3, %arg1[%arg2] : memref<10xf32>
-  }
-  return
 }
diff --git a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
index 97a40f0ee843a..b5c06f83cfba3 100644
--- a/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
+++ b/mlir/test/Examples/mlir-opt/loop_fusion_options.mlir
@@ -2,23 +2,25 @@
 
 // CHECK-LABEL: @producer_consumer_fusion
 // CHECK-COUNT-3: affine.for
-func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
-  %0 = memref.alloc() : memref<10xf32>
-  %1 = memref.alloc() : memref<10xf32>
-  %cst = arith.constant 0.000000e+00 : f32
-  affine.for %arg2 = 0 to 10 {
-    affine.store %cst, %0[%arg2] : memref<10xf32>
-    affine.store %cst, %1[%arg2] : memref<10xf32>
+module {
+  func.func @producer_consumer_fusion(%arg0: memref<10xf32>, %arg1: memref<10xf32>) {
+    %0 = memref.alloc() : memref<10xf32>
+    %1 = memref.alloc() : memref<10xf32>
+    %cst = arith.constant 0.000000e+00 : f32
+    affine.for %arg2 = 0 to 10 {
+      affine.store %cst, %0[%arg2] : memref<10xf32>
+      affine.store %cst, %1[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %0[%arg2] : memref<10xf32>
+      %3 = arith.addf %2, %2 : f32
+      affine.store %3, %arg0[%arg2] : memref<10xf32>
+    }
+    affine.for %arg2 = 0 to 10 {
+      %2 = affine.load %1[%arg2] : memref<10xf32>
+      %3 = arith.mulf %2, %2 : f32
+      affine.store %3, %arg1[%arg2] : memref<10xf32>
+    }
+    return
   }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %0[%arg2] : memref<10xf32>
-    %3 = arith.addf %2, %2 : f32
-    affine.store %3, %arg0[%arg2] : memref<10xf32>
-  }
-  affine.for %arg2 = 0 to 10 {
-    %2 = affine.load %1[%arg2] : memref<10xf32>
-    %3 = arith.mulf %2, %2 : f32
-    affine.store %3, %arg1[%arg2] : memref<10xf32>
-  }
-  return
 }