[llvm] adding mlir_issue folder (PR #66052)

Tue Sep 12 00:47:23 PDT 2023

https://github.com/SwapnilGhanshyala created https://github.com/llvm/llvm-project/pull/66052:

None

>From 3135d2da12823ddffe20291afad23f15861967dd Mon Sep 17 00:00:00 2001
From: swapnilghanshyala <swapnilghanshyala at gmail.com>
Date: Tue, 12 Sep 2023 13:11:27 +0530
Subject: [PATCH] adding mlir_issue folder

---
 mlir_issue/bug_report.txt |  61 +++
 mlir_issue/mlir-opt-help  | 887 ++++++++++++++++++++++++++++++++++++++
 mlir_issue/temp.mlir      |  17 +
 3 files changed, 965 insertions(+)
 create mode 100644 mlir_issue/bug_report.txt
 create mode 100644 mlir_issue/mlir-opt-help
 create mode 100644 mlir_issue/temp.mlir

diff --git a/mlir_issue/bug_report.txt b/mlir_issue/bug_report.txt
new file mode 100644
index 000000000000000..064e3934cbc55ca
--- /dev/null
+++ b/mlir_issue/bug_report.txt
@@ -0,0 +1,61 @@
+mlir-opt: /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/Support/MathExtras.h:378: uint64_t llvm::alignTo(uint64_t, uint64_t): 
+Assertion `Align != 0u && "Align can't be 0."' failed.
+PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
+Stack dump:
+0.      Program arguments: ../build/bin/mlir-opt --affine-loop-tile=cache-size=0 test.mlir
+ #0 0x00005638fd58721b backtrace (../build/bin/mlir-opt+0xb50221b)
+ #1 0x00005638fd7ee1a4 llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) /data/swapnil/Git_Clones/llvm-project/llvm/lib/Support/Unix/Signals.inc:727:8
+ #2 0x00005638fd7eecd3 PrintStackTraceSignalHandler(void*) /data/swapnil/Git_Clones/llvm-project/llvm/lib/Support/Unix/Signals.inc:798:1
+ #3 0x00005638fd7e9f4f llvm::sys::RunSignalHandlers() /data/swapnil/Git_Clones/llvm-project/llvm/lib/Support/Signals.cpp:106:18
+ #4 0x00005638fd7ef156 SignalHandler(int) /data/swapnil/Git_Clones/llvm-project/llvm/lib/Support/Unix/Signals.inc:0:3
+ #5 0x00007fe8c8c42520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
+ #6 0x00007fe8c8c96a7c __pthread_kill_implementation ./nptl/pthread_kill.c:44:76
+ #7 0x00007fe8c8c96a7c __pthread_kill_internal ./nptl/pthread_kill.c:78:10
+ #8 0x00007fe8c8c96a7c pthread_kill ./nptl/pthread_kill.c:89:10
+ #9 0x00007fe8c8c42476 gsignal ./signal/../sysdeps/posix/raise.c:27:6
+#10 0x00007fe8c8c287f3 abort ./stdlib/abort.c:81:7
+#11 0x00007fe8c8c2871b _nl_load_domain ./intl/loadmsgcat.c:1177:9
+#12 0x00007fe8c8c39e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96)
+#13 0x00005638fd9ca306 llvm::alignTo(unsigned long, unsigned long) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/Support/MathExtras.h:378:3
+#14 0x00005638fd9ca306 llvm::divideCeil(unsigned long, unsigned long) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/Support/MathExtras.h:415:10
+#15 0x00005638fd9ca306 (anonymous namespace)::LoopTiling::getTileSizes(llvm::ArrayRef<mlir::affine::AffineForOp>, llvm::SmallVectorImpl<unsigned int>*) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp:208:27
+#16 0x00005638fd9ca306 (anonymous namespace)::LoopTiling::runOnOperation() /data/swapnil/Git_Clones/llvm-project/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp:254:5
+#17 0x000056390474040d mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_7::operator()() const /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:0:17
+#18 0x000056390474040d void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_7>(long) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:45:12
+#19 0x0000563904736ce6 mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:0:12
+#20 0x0000563904738348 mlir::LogicalResult::failed() const /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:44:33
+#21 0x0000563904738348 mlir::failed(mlir::LogicalResult) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:72:58
+#22 0x0000563904738348 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:569:9
+#23 0x00005639047412c5 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_15::operator()(mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo&) const /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:789:36
+#24 0x000056390473add2 mlir::LogicalResult::failed() const /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:44:33
+#25 0x000056390473add2 mlir::failed(mlir::LogicalResult) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:72:58
+#26 0x000056390473add2 mlir::LogicalResult mlir::failableParallelForEach<__gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_15&>(mlir::MLIRContext*, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, __gnu_cxx::__normal_iterator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>>, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_15&) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/IR/Threading.h:46:11
+#27 0x000056390473add2 mlir::LogicalResult mlir::failableParallelForEach<std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_15&>(mlir::MLIRContext*, std::vector<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo, std::allocator<mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::OpPMInfo>>&, mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool)::$_15&) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/IR/Threading.h:92:10
+#28 0x000056390473add2 mlir::detail::OpToOpPassAdaptor::runOnOperationAsyncImpl(bool) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:799:14
+#29 0x0000563904740397 mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:0:5
+#30 0x0000563904740397 mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_7::operator()() const /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:501:20
+#31 0x0000563904740397 void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_7>(long) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:45:12
+#32 0x0000563904736ce6 mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:0:12
+#33 0x0000563904738348 mlir::LogicalResult::failed() const /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:44:33
+#34 0x0000563904738348 mlir::failed(mlir::LogicalResult) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:72:58
+#35 0x0000563904738348 mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:569:9
+#36 0x000056390473d7ba mlir::PassManager::runPasses(mlir::Operation*, mlir::AnalysisManager) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:880:3
+#37 0x000056390473d526 mlir::PassManager::run(mlir::Operation*) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Pass/Pass.cpp:0:0
+#38 0x000056390470cc25 mlir::LogicalResult::failed() const /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:44:33
+#39 0x000056390470cc25 mlir::failed(mlir::LogicalResult) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:72:58
+#40 0x000056390470cc25 performActions(llvm::raw_ostream&, std::shared_ptr<llvm::SourceMgr> const&, mlir::MLIRContext*, mlir::MlirOptMainConfig const&) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:376:7
+#41 0x000056390470b658 processBuffer(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, mlir::MlirOptMainConfig const&, mlir::DialectRegistry&, llvm::ThreadPool*) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:431:12
+#42 0x000056390470b658 mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, mlir::DialectRegistry&, mlir::MlirOptMainConfig const&)::$_2::operator()(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) const /data/swapnil/Git_Clones/llvm-project/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:472:12
+#43 0x000056390470b658 mlir::LogicalResult llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>::callback_fn<mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, mlir::DialectRegistry&, mlir::MlirOptMainConfig const&)::$_2>(long, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:45:12
+#44 0x00005639049280ba llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>::operator()(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&) const /data/swapnil/Git_Clones/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:68:12
+#45 0x0000563904927365 mlir::splitAndProcessBuffer(std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::function_ref<mlir::LogicalResult (std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, llvm::raw_ostream&)>, llvm::raw_ostream&, bool, bool) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Support/ToolUtilities.cpp:28:12
+#46 0x0000563904707452 mlir::MlirOptMain(llvm::raw_ostream&, std::unique_ptr<llvm::MemoryBuffer, std::default_delete<llvm::MemoryBuffer>>, mlir::DialectRegistry&, mlir::MlirOptMainConfig const&) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:475:10
+#47 0x0000563904707c39 mlir::MlirOptMain(int, char**, llvm::StringRef, mlir::DialectRegistry&) /data/swapnil/Git_Clones/llvm-project/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp:531:14
+#48 0x00005638fd60bd13 mlir::LogicalResult::succeeded() const /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Support/LogicalResult.h:41:35
+#49 0x00005638fd60bd13 mlir::asMainReturnCode(mlir::LogicalResult) /data/swapnil/Git_Clones/llvm-project/mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h:259:12
+#50 0x00005638fd60bd13 main /data/swapnil/Git_Clones/llvm-project/mlir/tools/mlir-opt/mlir-opt.cpp:286:10
+#51 0x00007fe8c8c29d90 __libc_start_call_main ./csu/../sysdeps/nptl/libc_start_call_main.h:58:16
+#52 0x00007fe8c8c29e40 call_init ./csu/../csu/libc-start.c:128:20
+#53 0x00007fe8c8c29e40 __libc_start_main ./csu/../csu/libc-start.c:379:5
+#54 0x00005638fd54b6e5 _start (../build/bin/mlir-opt+0xb4c66e5)
+Aborted (core dumped)
\ No newline at end of file
diff --git a/mlir_issue/mlir-opt-help b/mlir_issue/mlir-opt-help
new file mode 100644
index 000000000000000..0261783ff1f7f3b
--- /dev/null
+++ b/mlir_issue/mlir-opt-help
@@ -0,0 +1,887 @@
+OVERVIEW: MLIR modular optimizer driver
+
+Available Dialects: acc, affine, amdgpu, amx, arith, arm_neon, arm_sme, arm_sve, async, bufferization, builtin, cf, complex, dlti, emitc, func, gpu, index, irdl, linalg, llvm, math, memref, ml_program, nvgpu, nvvm, omp, pdl, pdl_interp, quant, rocdl, scf, shape, sparse_tensor, spirv, tensor, test, test_dyn, tosa, transform, ub, vector, x86vector
+USAGE: mlir-opt [options] <input file>
+
+OPTIONS:
+
+Color Options:
+
+  --color                                                     - Use colors in output (default=autodetect)
+
+General options:
+
+  --abort-on-max-devirt-iterations-reached                    - Abort when the max iterations for devirtualization CGSCC repeat pass is reached
+  --allow-unregistered-dialect                                - Allow operation with no registered dialects
+  --atomic-counter-update-promoted                            - Do counter update using atomic fetch add  for promoted counters only
+  --atomic-first-counter                                      - Use atomic fetch add for first counter in a function (usually the entry counter)
+  --bounds-checking-single-trap                               - Use one trap block per function
+  --cfg-hide-cold-paths=<number>                              - Hide blocks with relative frequency below the given value
+  --cfg-hide-deoptimize-paths                                 - 
+  --cfg-hide-unreachable-paths                                - 
+  --cost-kind=<value>                                         - Target cost kind
+    =throughput                                               -   Reciprocal throughput
+    =latency                                                  -   Instruction latency
+    =code-size                                                -   Code size
+    =size-latency                                             -   Code size and latency
+  --debug-info-correlate                                      - Use debug info to correlate profiles.
+  --debugify-func-limit=<ulong>                               - Set max number of processed functions per pass.
+  --debugify-level=<value>                                    - Kind of debug info to add
+    =locations                                                -   Locations only
+    =location+variables                                       -   Locations and Variables
+  --debugify-quiet                                            - Suppress verbose debugify output
+  --disable-auto-upgrade-debug-info                           - Disable autoupgrade of debug info
+  --disable-i2p-p2i-opt                                       - Disables inttoptr/ptrtoint roundtrip optimization
+  --do-counter-promotion                                      - Do counter register promotion
+  --dot-cfg-mssa=<file name for generated dot file>           - file name for generated dot file
+  --dump-pass-pipeline                                        - Print the pipeline that will be run
+  --emit-bytecode                                             - Emit bytecode when generating output
+  --emit-bytecode-version=<value>                             - Use specified bytecode when generating output
+  --enable-gvn-hoist                                          - Enable the GVN hoisting pass (default = off)
+  --enable-gvn-memdep                                         - 
+  --enable-gvn-sink                                           - Enable the GVN sinking pass (default = off)
+  --enable-load-in-loop-pre                                   - 
+  --enable-load-pre                                           - 
+  --enable-loop-simplifycfg-term-folding                      - 
+  --enable-name-compression                                   - Enable name/filename string compression
+  --enable-split-backedge-in-load-pre                         - 
+  --experimental-debug-variable-locations                     - Use experimental new value-tracking variable locations
+  --force-tail-folding-style=<value>                          - Force the tail folding style
+    =none                                                     -   Disable tail folding
+    =data                                                     -   Create lane mask for data only, using active.lane.mask intrinsic
+    =data-without-lane-mask                                   -   Create lane mask with compare/stepvector
+    =data-and-control                                         -   Create lane mask using active.lane.mask intrinsic, and use it for both data and control flow
+    =data-and-control-without-rt-check                        -   Similar to data-and-control, but remove the runtime check
+  --fs-profile-debug-bw-threshold=<uint>                      - Only show debug message if the source branch weight is greater  than this value.
+  --fs-profile-debug-prob-diff-threshold=<uint>               - Only show debug message if the branch probility is greater than this value (in percentage).
+  --generate-merged-base-profiles                             - When generating nested context-sensitive profiles, always generate extra base profile for function with all its context profiles merged into it.
+  --hash-based-counter-split                                  - Rename counter variable of a comdat function based on cfg hash
+  --hot-cold-split                                            - Enable hot-cold splitting pass
+  --import-all-index                                          - Import all external functions in index.
+  --instcombine-code-sinking                                  - Enable code sinking
+  --instcombine-guard-widening-window=<uint>                  - How wide an instruction window to bypass looking for another guard
+  --instcombine-max-num-phis=<uint>                           - Maximum number phis to handle in intptr/ptrint folding
+  --instcombine-max-sink-users=<uint>                         - Maximum number of undroppable users for instruction sinking
+  --instcombine-maxarray-size=<uint>                          - Maximum array size considered when doing a combine
+  --instcombine-negator-enabled                               - Should we attempt to sink negations?
+  --instcombine-negator-max-depth=<uint>                      - What is the maximal lookup depth when trying to check for viability of negation sinking.
+  --instrprof-atomic-counter-update-all                       - Make all profile counter updates atomic (for testing only)
+  --internalize-public-api-file=<filename>                    - A file containing list of symbol names to preserve
+  --internalize-public-api-list=<list>                        - A list of symbol names to preserve
+  --irdl-file=<filename>                                      - IRDL file to register before processing the input
+  --iterative-counter-promotion                               - Allow counter promotion across the whole loop nest.
+  --load-dialect-plugin=<string>                              - Load dialects from plugin library
+  --load-pass-plugin=<string>                                 - Load passes from plugin library
+  --log-actions-to=<string>                                   - Log action execution to a file, or stderr if  '-' is passed
+  --log-mlir-actions-filter=<string>                          - Comma separated list of locations to filter actions from logging
+  --matrix-default-layout=<value>                             - Sets the default matrix layout
+    =column-major                                             -   Use column-major layout
+    =row-major                                                -   Use row-major layout
+  --matrix-print-after-transpose-opt                          - 
+  --max-counter-promotions=<int>                              - Max number of allowed counter promotions
+  --max-counter-promotions-per-loop=<uint>                    - Max number counter promotions per loop to avoid increasing register pressure too much
+  --mir-strip-debugify-only                                   - Should mir-strip-debug only strip debug info from debugified modules by default
+  --misexpect-tolerance=<uint>                                - Prevents emiting diagnostics when profile counts are within N% of the threshold..
+  --mlir-debug-counter=<string>                               - Comma separated list of debug counter skip and count arguments
+  --mlir-disable-threading                                    - Disable multi-threading within MLIR, overrides any further call to MLIRContext::enableMultiThreading()
+  --mlir-elide-elementsattrs-if-larger=<uint>                 - Elide ElementsAttrs with "..." that have more elements than the given upper limit
+  --mlir-elide-resource-strings-if-larger=<uint>              - Elide printing value of resources if string is too long in chars.
+  --mlir-enable-debugger-hook                                 - Enable Debugger hook for debugging MLIR Actions
+  --mlir-pass-pipeline-crash-reproducer=<string>              - Generate a .mlir reproducer file at the given output path if the pass manager crashes or fails
+  --mlir-pass-pipeline-local-reproducer                       - When generating a crash reproducer, attempt to generated a reproducer with the smallest pipeline.
+  --mlir-pass-statistics                                      - Display the statistics of each pass
+  --mlir-pass-statistics-display=<value>                      - Display method for pass statistics
+    =list                                                     -   display the results in a merged list sorted by pass name
+    =pipeline                                                 -   display the results with a nested pipeline view
+  --mlir-pretty-debuginfo                                     - Print pretty debug info in MLIR output
+  --mlir-print-debug-counter                                  - Print out debug counter information after all counters have been accumulated
+  --mlir-print-debuginfo                                      - Print debug info in MLIR output
+  --mlir-print-elementsattrs-with-hex-if-larger=<long>        - Print DenseElementsAttrs with a hex string that have more elements than the given upper limit (use -1 to disable)
+  --mlir-print-ir-after=<pass-arg>                            - Print IR after specified passes
+  --mlir-print-ir-after-all                                   - Print IR after each pass
+  --mlir-print-ir-after-change                                - When printing the IR after a pass, only print if the IR changed
+  --mlir-print-ir-after-failure                               - When printing the IR after a pass, only print if the pass failed
+  --mlir-print-ir-before=<pass-arg>                           - Print IR before specified passes
+  --mlir-print-ir-before-all                                  - Print IR before each pass
+  --mlir-print-ir-module-scope                                - When printing IR for print-ir-[before|after]{-all} always print the top-level operation
+  --mlir-print-local-scope                                    - Print with local scope and inline information (eliding aliases for attributes, types, and locations
+  --mlir-print-op-on-diagnostic                               - When a diagnostic is emitted on an operation, also print the operation as an attached note
+  --mlir-print-stacktrace-on-diagnostic                       - When a diagnostic is emitted, also print the stack trace as an attached note
+  --mlir-print-value-users                                    - Print users of operation results and block arguments as a comment
+  --mlir-timing                                               - Display execution times
+  --mlir-timing-display=<value>                               - Display method for timing data
+    =list                                                     -   display the results in a list sorted by total time
+    =tree                                                     -   display the results ina with a nested tree view
+  --no-discriminators                                         - Disable generation of discriminator information.
+  --no-implicit-module                                        - Disable implicit addition of a top-level module op during parsing
+  -o <filename>                                               - Output filename
+  --pass-pipeline=<string>                                    - Textual description of the pass pipeline to run
+  --pgo-block-coverage                                        - Use this option to enable basic block coverage instrumentation
+  --pgo-temporal-instrumentation                              - Use this option to enable temporal instrumentation
+  --pgo-view-block-coverage-graph                             - Create a dot file of CFGs with block coverage inference information
+  --poison-checking-function-local                            - Check that returns are non-poison (for testing)
+  --print-pipeline-passes                                     - Print a '-passes' compatible string describing the pipeline (best-effort only).
+  --run-reproducer                                            - Run the pipeline stored in the reproducer
+  --runtime-counter-relocation                                - Enable relocating counters at runtime.
+  --safepoint-ir-verifier-print-only                          - 
+  --sample-profile-check-record-coverage=<N>                  - Emit a warning if less than N% of records in the input profile are matched to the IR.
+  --sample-profile-check-sample-coverage=<N>                  - Emit a warning if less than N% of samples in the input profile are matched to the IR.
+  --sample-profile-max-propagate-iterations=<uint>            - Maximum number of iterations to go through when propagating sample block/edge weights through the CFG.
+  --show-dialects                                             - Print the list of registered dialects and exit
+  --skip-ret-exit-block                                       - Suppress counter promotion if exit blocks contain ret.
+  --speculative-counter-promotion-max-exiting=<uint>          - The max number of exiting blocks of a loop to allow  speculative counter promotion
+  --speculative-counter-promotion-to-loop                     - When the option is false, if the target block is in a loop, the promotion will be disallowed unless the promoted counter  update can be further/iteratively promoted into an acyclic  region.
+  --split-input-file                                          - Split the input file into pieces and process each chunk independently
+  --summary-file=<string>                                     - The summary file to use for function importing.
+  Compiler passes to run
+    Passes:
+      --affine-data-copy-generate                             -   Generate explicit copying for affine memory operations
+        --fast-mem-capacity=<ulong>                           - Set fast memory space capacity in KiB (default: unlimited)
+        --fast-mem-space=<uint>                               - Fast memory space identifier for copy generation (default: 1)
+        --generate-dma                                        - Generate DMA instead of point-wise copy
+        --min-dma-transfer=<int>                              - Minimum DMA transfer size supported by the target in bytes
+        --skip-non-unit-stride-loops                          - Testing purposes: avoid non-unit stride loop choice depths for copy placement
+        --slow-mem-space=<uint>                               - Slow memory space identifier for copy generation (default: 0)
+        --tag-mem-space=<uint>                                - Tag memory space identifier for copy generation (default: 0)
+      --affine-expand-index-ops                               -   Lower affine operations operating on indices into more fundamental operations
+      --affine-loop-coalescing                                -   Coalesce nested loops with independent bounds into a single loop
+      --affine-loop-fusion                                    -   Fuse affine loop nests
+        --fusion-compute-tolerance=<number>                   - Fractional increase in additional computation tolerated while fusing
+        --fusion-fast-mem-space=<uint>                        - Faster memory space number to promote fusion buffers to
+        --fusion-local-buf-threshold=<ulong>                  - Threshold size (KiB) for promoting local buffers to fast memory space
+        --fusion-maximal                                      - Enables maximal loop fusion
+        --mode=<value>                                        - fusion mode to attempt
+    =greedy                                             -   Perform greedy (both producer-consumer and sibling)  fusion
+    =producer                                           -   Perform only producer-consumer fusion
+    =sibling                                            -   Perform only sibling fusion
+      --affine-loop-invariant-code-motion                     -   Hoist loop invariant instructions outside of affine loops
+      --affine-loop-normalize                                 -   Apply normalization transformations to affine loop-like ops
+        --promote-single-iter                                 - Promote single iteration loops
+      --affine-loop-tile                                      -   Tile affine loop nests
+        --cache-size=<ulong>                                  - Set size of cache to tile for in KiB (default: 512)
+        --separate                                            - Separate full and partial tiles (default: false)
+        --tile-size=<uint>                                    - Use this tile size for all loops
+        --tile-sizes=<uint>                                   - List of tile sizes for each perfect nest (overridden by -tile-size)
+      --affine-loop-unroll                                    -   Unroll affine loops
+        --cleanup-unroll                                      - Fully unroll the cleanup loop when possible.
+        --unroll-factor=<uint>                                - Use this unroll factor for all loops being unrolled
+        --unroll-full                                         - Fully unroll loops
+        --unroll-full-threshold=<uint>                        - Unroll all loops with trip count less than or equal to this
+        --unroll-num-reps=<uint>                              - Unroll innermost loops repeatedly this many times
+        --unroll-up-to-factor                                 - Allow unrolling up to the factor specified
+      --affine-loop-unroll-jam                                -   Unroll and jam affine loops
+        --unroll-jam-factor=<uint>                            - Use this unroll jam factor for all loops (default 4)
+      --affine-parallelize                                    -   Convert affine.for ops into 1-D affine.parallel
+        --max-nested=<uint>                                   - Maximum number of nested parallel loops to produce. Defaults to unlimited (UINT_MAX).
+        --parallel-reductions                                 - Whether to parallelize reduction loops. Defaults to false.
+      --affine-pipeline-data-transfer                         -   Pipeline non-blocking data transfers between explicitly managed levels of the memory hierarchy
+      --affine-scalrep                                        -   Replace affine memref accesses by scalars by forwarding stores to loads and eliminating redundant loads
+      --affine-simplify-structures                            -   Simplify affine expressions in maps/sets and normalize memrefs
+      --affine-super-vectorize                                -   Vectorize to a target independent n-D vector abstraction
+        --test-fastest-varying=<long>                         - Specify a 1-D, 2-D or 3-D pattern of fastest varying memory dimensions to match. See defaultPatterns in Vectorize.cpp for a description and examples. This is used for testing purposes
+        --vectorize-reductions                                - Vectorize known reductions expressed via iter_args. Switched off by default.
+        --virtual-vector-size=<long>                          - Specify an n-D virtual vector size for vectorization
+      --affine-super-vectorizer-test                          -   Tests vectorizer standalone functionality.
+      --allocate-arm-sme-tiles                                -   Allocate SME tiles
+      --amdgpu-emulate-atomics                                -   Emulate atomic operations on chipsets that do not support them
+        --chipset=<string>                                    - Chipset that these operations will run on
+      --arith-bufferize                                       -   Bufferize Arith dialect ops.
+        --alignment=<uint>                                    - Create global memrefs with a specified alignment
+      --arith-emulate-unsupported-floats                      -   Emulate operations on unsupported floats with extf/truncf
+        --source-types=<string>                               - MLIR types without arithmetic support on a given target
+        --target-type=<string>                                - MLIR type to convert the unsupported source types to
+      --arith-emulate-wide-int                                -   Emulate 2*N-bit integer operations using N-bit operations
+        --widest-int-supported=<uint>                         - Widest integer type supported by the target
+      --arith-expand                                          -   Legalize Arith ops to be convertible to LLVM.
+        --include-bf16                                        - Enable the BF16 expansion patterns
+      --arith-int-narrowing                                   -   Reduce integer operation bitwidth
+        --int-bitwidths-supported=<uint>                      - Integer bitwidths supported
+      --arith-unsigned-when-equivalent                        -   Replace signed ops with unsigned ones where they are proven equivalent
+      --arm-neon-2d-to-intr                                   -   Convert Arm NEON structured ops to intrinsics
+      --async-func-to-async-runtime                           -   Lower async.func operations to the explicit async.runtime and async.coro operations
+      --async-parallel-for                                    -   Convert scf.parallel operations to multiple async compute ops executed concurrently for non-overlapping iteration ranges
+        --async-dispatch                                      - Dispatch async compute tasks using recursive work splitting. If `false` async compute tasks will be launched using simple for loop in the caller thread.
+        --min-task-size=<int>                                 - The minimum task size for sharding parallel operation.
+        --num-workers=<int>                                   - The number of available workers to execute async operations. If `-1` the value will be retrieved from the runtime.
+      --async-runtime-policy-based-ref-counting               -   Policy based reference counting for Async runtime operations
+      --async-runtime-ref-counting                            -   Automatic reference counting for Async runtime operations
+      --async-runtime-ref-counting-opt                        -   Optimize automatic reference counting operations for the Async runtime by removing redundant operations
+      --async-to-async-runtime                                -   Lower all high level async operations (e.g. async.execute) to the explicit async.runtime and async.coro operations
+      --buffer-deallocation                                   -   Adds all required dealloc operations for all allocations in the input program
+      --buffer-deallocation-simplification                    -   Optimizes `bufferization.dealloc` operation for more efficient codegen
+      --buffer-hoisting                                       -   Optimizes placement of allocation operations by moving them into common dominators and out of nested regions
+      --buffer-loop-hoisting                                  -   Optimizes placement of allocation operations by moving them out of loop nests
+      --buffer-results-to-out-params                          -   Converts memref-typed function results to out-params
+      --bufferization-bufferize                               -   Bufferize the `bufferization` dialect
+      --canonicalize                                          -   Canonicalize operations
+        --disable-patterns=<string>                           - Labels of patterns that should be filtered out during application
+        --enable-patterns=<string>                            - Labels of patterns that should be used during application, all other patterns are filtered out
+        --max-iterations=<long>                               - Max. iterations between applying patterns / simplifying regions
+        --max-num-rewrites=<long>                             - Max. number of pattern rewrites within an iteration
+        --region-simplify                                     - Perform control flow optimizations to the region tree
+        --test-convergence                                    - Test only: Fail pass on non-convergence to detect cyclic pattern
+        --top-down                                            - Seed the worklist in general top-down order
+      --control-flow-sink                                     -   Sink operations into conditional blocks
+      --convert-affine-for-to-gpu                             -   Convert top-level AffineFor Ops to GPU kernels
+        --gpu-block-dims=<uint>                               - Number of GPU block dimensions for mapping
+        --gpu-thread-dims=<uint>                              - Number of GPU thread dimensions for mapping
+      --convert-amdgpu-to-rocdl                               -   Convert AMDGPU dialect to ROCDL dialect
+        --chipset=<string>                                    - Chipset that these operations will run on
+      --convert-arith-to-llvm                                 -   Convert Arith dialect to LLVM dialect
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+      --convert-arith-to-spirv                                -   Convert Arith dialect to SPIR-V dialect
+        --emulate-lt-32-bit-scalar-types                      - Emulate narrower scalar types with 32-bit ones if not supported by the target
+        --enable-fast-math                                    - Enable fast math mode (assuming no NaN and infinity for floating point values) when performing conversion
+      --convert-arm-sme-to-scf                                -   Lower the operations from the ArmSME dialect into the SCF dialect
+      --convert-async-to-llvm                                 -   Convert the operations from the async dialect into the LLVM dialect
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-bufferization-to-memref                       -   Convert operations from the Bufferization dialect to the MemRef dialect
+      --convert-cf-to-llvm                                    -   Convert ControlFlow operations to the LLVM dialect
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-cf-to-spirv                                   -   Convert ControlFlow dialect to SPIR-V dialect
+        --emulate-lt-32-bit-scalar-types                      - Emulate narrower scalar types with 32-bit ones if not supported by the target
+      --convert-complex-to-libm                               -   Convert Complex dialect to libm calls
+      --convert-complex-to-llvm                               -   Convert Complex dialect to LLVM dialect
+      --convert-complex-to-spirv                              -   Convert Complex dialect to SPIRV dialect
+      --convert-complex-to-standard                           -   Convert Complex dialect to standard dialect
+      --convert-elementwise-to-linalg                         -   Convert ElementwiseMappable ops to linalg
+      --convert-func-to-llvm                                  -   Convert from the Func dialect to the LLVM dialect
+        --data-layout=<string>                                - String description (LLVM format) of the data layout that is expected on the produced module
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+        --use-bare-ptr-memref-call-conv                       - Replace FuncOp's MemRef arguments with bare pointers to the MemRef element types
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-func-to-spirv                                 -   Convert Func dialect to SPIR-V dialect
+        --emulate-lt-32-bit-scalar-types                      - Emulate narrower scalar types with 32-bit ones if not supported by the target
+      --convert-gpu-launch-to-vulkan-launch                   -   Convert gpu.launch_func to vulkanLaunch external call
+      --convert-gpu-to-nvvm                                   -   Generate NVVM operations for gpu operations
+        --has-redux                                           - Target gpu supports redux
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+        --use-bare-ptr-memref-call-conv                       - Replace memref arguments in GPU functions with bare pointers. All memrefs must have static shape.
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-gpu-to-rocdl                                  -   Generate ROCDL operations for gpu operations
+        --chipset=<string>                                    - Chipset that these operations will run on
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+        --runtime=<value>                                     - Runtime code will be run on (default is Unknown, can also use HIP or OpenCL)
+    =unknown                                            -   Unknown (default)
+    =HIP                                                -   HIP
+    =OpenCL                                             -   OpenCL
+        --use-bare-ptr-memref-call-conv                       - Replace memref arguments in GPU functions with bare pointers. All memrefs must have static shape
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-gpu-to-spirv                                  -   Convert GPU dialect to SPIR-V dialect
+        --use-64bit-index                                     - Use 64-bit integers to convert index types
+      --convert-index-to-llvm                                 -   Lower the `index` dialect to the `llvm` dialect.
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+      --convert-linalg-to-affine-loops                        -   Lower the operations from the linalg dialect into affine loops
+      --convert-linalg-to-loops                               -   Lower the operations from the linalg dialect into loops
+      --convert-linalg-to-parallel-loops                      -   Lower the operations from the linalg dialect into parallel loops
+      --convert-linalg-to-std                                 -   Convert the operations from the linalg dialect into the Standard dialect
+      --convert-math-to-funcs                                 -   Convert Math operations to calls of outlined implementations.
+        --convert-ctlz                                        - Convert math.ctlz to a software implementation. Enable for targets that do not natively support ctlz.
+        --min-width-of-fpowi-exponent=<uint>                  - Convert FPowI only if the width of its exponent's integer type is greater than or equal to this value
+      --convert-math-to-libm                                  -   Convert Math dialect to libm calls
+      --convert-math-to-llvm                                  -   Convert Math dialect to LLVM dialect
+        --approximate-log1p                                   - Enable approximation of Log1p.
+      --convert-math-to-spirv                                 -   Convert Math dialect to SPIR-V dialect
+      --convert-memref-to-spirv                               -   Convert MemRef dialect to SPIR-V dialect
+        --bool-num-bits=<int>                                 - The number of bits to store a boolean value
+        --use-64bit-index                                     - Use 64-bit integers to convert index types
+      --convert-nvgpu-to-nvvm                                 -   Convert NVGPU dialect to NVVM dialect
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-nvvm-to-llvm                                  -   Convert NVVM dialect to LLVM dialect
+      --convert-openacc-to-scf                                -   Convert the OpenACC ops to OpenACC with SCF dialect
+      --convert-openmp-to-llvm                                -   Convert the OpenMP ops to OpenMP ops with LLVM dialect
+      --convert-parallel-loops-to-gpu                         -   Convert mapped scf.parallel ops to gpu launch operations
+      --convert-pdl-to-pdl-interp                             -   Convert PDL ops to PDL interpreter ops
+      --convert-scf-to-cf                                     -   Convert SCF dialect to ControlFlow dialect, replacing structured control flow with a CFG
+      --convert-scf-to-openmp                                 -   Convert SCF parallel loop to OpenMP parallel + workshare constructs.
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-scf-to-spirv                                  -   Convert SCF dialect to SPIR-V dialect.
+      --convert-shape-constraints                             -   Convert shape constraint operations to the standard dialect
+      --convert-shape-to-std                                  -   Convert operations from the shape dialect into the standard dialect
+      --convert-spirv-to-llvm                                 -   Convert SPIR-V dialect to LLVM dialect
+        --client-api=<value>                                  - Derive StorageClass to address space mapping from the client API
+    =Unknown                                            -   Unknown (default)
+    =Metal                                              -   Metal
+    =OpenCL                                             -   OpenCL
+    =Vulkan                                             -   Vulkan
+    =WebGPU                                             -   WebGPU
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-tensor-to-linalg                              -   Convert some Tensor dialect ops to Linalg dialect
+      --convert-tensor-to-spirv                               -   Convert Tensor dialect to SPIR-V dialect
+        --emulate-lt-32-bit-scalar-types                      - Emulate narrower scalar types with 32-bit ones if not supported by the target
+      --convert-to-llvm                                       -   Convert to LLVM via dialect interfaces found in the input IR
+        --filter-dialects=<string>                            - Test conversion patterns of only the specified dialects
+      --convert-ub-to-llvm                                    -   Convert UB dialect to LLVM dialect
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+      --convert-ub-to-spirv                                   -   Convert UB dialect to SPIR-V dialect
+      --convert-vector-to-arm-sme                             -   Lower the operations from the vector dialect into the ArmSME dialect
+      --convert-vector-to-gpu                                 -   Lower the operations from the vector dialect into the GPU dialect
+        --use-nvgpu                                           - convert to NvGPU ops instead of GPU dialect ops
+      --convert-vector-to-llvm                                -   Lower the operations from the vector dialect into the LLVM dialect
+        --enable-amx                                          - Enables the use of AMX dialect while lowering the vector dialect.
+        --enable-arm-neon                                     - Enables the use of ArmNeon dialect while lowering the vector dialect.
+        --enable-arm-sme                                      - Enables the use of ArmSME dialect while lowering the vector dialect.
+        --enable-arm-sve                                      - Enables the use of ArmSVE dialect while lowering the vector dialect.
+        --enable-x86vector                                    - Enables the use of X86Vector dialect while lowering the vector dialect.
+        --force-32bit-vector-indices                          - Allows compiler to assume vector indices fit in 32-bit if that yields faster code
+        --reassociate-fp-reductions                           - Allows llvm to reassociate floating-point reductions for speed
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --convert-vector-to-scf                                 -   Lower the operations from the vector dialect into the SCF dialect
+        --full-unroll                                         - Perform full unrolling when converting vector transfers to SCF
+        --lower-tensors                                       - Lower transfer ops that operate on tensors
+        --target-rank=<uint>                                  - Target vector rank to which transfer ops should be lowered
+      --convert-vector-to-spirv                               -   Convert Vector dialect to SPIR-V dialect
+      --cse                                                   -   Eliminate common sub-expressions
+      --decorate-spirv-composite-type-layout                  -   Decorate SPIR-V composite type with layout info
+      --drop-equivalent-buffer-results                        -   Remove MemRef return values that are equivalent to a bbArg
+      --duplicate-function-elimination                        -   Deduplicate functions
+      --eliminate-empty-tensors                               -   Try to eliminate all tensor.empty ops.
+      --empty-tensor-to-alloc-tensor                          -   Replace all empty ops by alloc_tensor ops.
+      --enable-arm-streaming                                  -   Enable Armv9 Streaming SVE mode
+        --enable-za                                           - Enable ZA storage array.
+        --mode=<value>                                        - Select how streaming-mode is managed at the function-level.
+    =default                                            -   Streaming mode is part of the function interface (ABI), caller manages PSTATE.SM on entry/exit.
+    =locally                                            -   Streaming mode is internal to the function, callee manages PSTATE.SM on entry/exit.
+      --ensure-debug-info-scope-on-llvm-func                  -   Materialize LLVM debug info subprogram attribute on every LLVMFuncOp
+      --expand-strided-metadata                               -   Expand memref operations into easier to analyze constructs
+      --finalize-memref-to-llvm                               -   Finalize MemRef dialect to LLVM dialect conversion
+        --index-bitwidth=<uint>                               - Bitwidth of the index type, 0 to use size of machine word
+        --use-aligned-alloc                                   - Use aligned_alloc in place of malloc for heap allocations
+        --use-generic-functions                               - Use generic allocation and deallocation functions instead of the classic 'malloc', 'aligned_alloc' and 'free' functions
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --finalizing-bufferize                                  -   Finalize a partial bufferization
+      --fold-memref-alias-ops                                 -   Fold memref alias ops into consumer load/store ops
+      --fold-tensor-subset-ops                                -   Fold tensor subset ops into producer/consumer ops
+      --func-bufferize                                        -   Bufferize func/call/return ops
+      --generate-runtime-verification                         -   Generate additional runtime op verification checks
+      --gpu-async-region                                      -   Make GPU ops async
+      --gpu-decompose-memrefs                                 -   Decomposes memref index computation into explicit ops.
+      --gpu-kernel-outlining                                  -   Outline gpu.launch bodies to kernel functions
+        --data-layout-str=<string>                            - String containing the data layout specification to be attached to the GPU kernel module
+      --gpu-launch-sink-index-computations                    -   Sink index computations into gpu.launch body
+      --gpu-map-parallel-loops                                -   Greedily maps loops to GPU hardware dimensions.
+      --gpu-module-to-binary                                  -   Transforms a GPU module into a GPU binary.
+        --format=<string>                                     - The target representation of the compilation process.
+        --handler=<value>                                     - Offloading handler to be attached to the resulting binary op.
+        -l <string>                                           - Extra files to link to.
+        --opts=<string>                                       - Command line options to pass to the tools.
+        --toolkit=<string>                                    - Toolkit path.
+      --gpu-to-llvm                                           -   Convert GPU dialect to LLVM dialect with GPU runtime calls
+        --gpu-binary-annotation=<string>                      - Annotation attribute string for GPU binary
+        --use-bare-pointers-for-host                          - Use bare pointers to pass memref arguments to host functions. All memrefs must have static shape.
+        --use-bare-pointers-for-kernels                       - Use bare pointers to pass memref arguments to kernels. The kernel must use the same setting for this option.
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --inline                                                -   Inline function calls
+        --default-pipeline=<string>                           - The default optimizer pipeline used for callables
+        --max-iterations=<uint>                               - Maximum number of iterations when inlining within an SCC
+        --op-pipelines=<pass-manager>                         - Callable operation specific optimizer pipelines (in the form of `dialect.op(pipeline)`)
+      --int-range-optimizations                               -   Do optimizations based on integer range analysis
+      --launch-func-to-vulkan                                 -   Convert vulkanLaunch external call to Vulkan runtime external calls
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --lift-cf-to-scf                                        -   Lift ControlFlow dialect to SCF dialect
+      --linalg-bufferize                                      -   Bufferize the linalg dialect
+      --linalg-detensorize                                    -   Detensorize linalg ops
+        --aggressive-mode                                     - Detensorize all ops that qualify for detensoring along with branch operands and basic-block arguments.
+      --linalg-fold-unit-extent-dims                          -   Remove unit-extent dimension in Linalg ops on tensors
+        --use-rank-reducing-slices                            - Generate rank-reducing slices instead of reassociative reshapes
+      --linalg-fuse-elementwise-ops                           -   Fuse elementwise operations on tensors
+      --linalg-generalize-named-ops                           -   Convert named ops into generic ops
+      --linalg-inline-scalar-operands                         -   Inline scalar operands into linalg generic ops
+      --linalg-named-op-conversion                            -   Convert from one named linalg op to another.
+      --llvm-legalize-for-export                              -   Legalize LLVM dialect to be convertible to LLVM IR
+      --llvm-optimize-for-nvvm-target                         -   Optimize NVVM IR
+      --llvm-request-c-wrappers                               -   Request C wrapper emission for all functions
+      --llvm-type-consistency                                 -   Rewrites to improve type consistency
+        --max-vector-split-size=<uint>                        - Maximum size in bits of a vector value in a load or store operation operating on multiple elements that should still be split
+      --loop-invariant-code-motion                            -   Hoist loop invariant instructions outside of the loop
+      --lower-affine                                          -   Lower Affine operations to a combination of Standard and SCF operations
+      --lower-host-to-llvm                                    -   Lowers the host module code and `gpu.launch_func` to LLVM
+        --use-opaque-pointers                                 - Generate LLVM IR using opaque pointers instead of typed pointers
+      --lower-vector-mask                                     -   Lower 'vector.mask' operations
+      --map-memref-spirv-storage-class                        -   Map numeric MemRef memory spaces to SPIR-V storage classes
+        --client-api=<string>                                 - The client API to use for populating mappings
+      --math-uplift-to-fma                                    -   Uplift arith ops to math.fma.
+      --mem2reg                                               -   Promotes memory slots into values.
+        --region-simplify                                     - Perform control flow optimizations to the region tree
+      --memref-emulate-wide-int                               -   Emulate 2*N-bit integer operations using N-bit operations
+        --widest-int-supported=<uint>                         - Widest integer type supported by the target
+      --memref-expand                                         -   Legalize memref operations to be convertible to LLVM.
+      --normalize-memrefs                                     -   Normalize memrefs
+      --nvgpu-optimize-shared-memory                          -   Optimizes accesses to shared memory memrefs in order to reduce bank conflicts.
+      --nvvm-attach-target                                    -   Attaches an NVVM target attribute to a GPU Module.
+        -O <uint>                                             - Optimization level.
+        --chip=<string>                                       - Target chip.
+        --fast                                                - Enable fast math mode.
+        --features=<string>                                   - Target features.
+        --ftz                                                 - Enable flush to zero for denormals.
+        -l <string>                                           - Extra bitcode libraries paths to link to.
+        --module=<string>                                     - Regex used to identify the modules to attach the target to.
+        --triple=<string>                                     - Target triple.
+      --one-shot-bufferize                                    -   One-Shot Bufferize
+        --allow-return-allocs                                 - Allows returning/yielding new allocations from a block.
+        --allow-unknown-ops                                   - Allows unknown (not bufferizable) ops in the input IR.
+        --analysis-fuzzer-seed=<uint>                         - Test only: Analyze ops in random order with a given seed (fuzzer)
+        --analysis-heuristic=<string>                         - Heuristic that control the IR traversal during analysis
+        --bufferize-function-boundaries                       - Bufferize function boundaries (experimental).
+        --copy-before-write                                   - Skip the analysis. Make a buffer copy on every write.
+        --create-deallocs                                     - Specify if buffers should be deallocated. For compatibility with core bufferization passes.
+        --dialect-filter=<string>                             - Restrict bufferization to ops from these dialects.
+        --dump-alias-sets                                     - Test only: Annotate tensor IR with alias sets
+        --function-boundary-type-conversion=<string>          - Controls layout maps when bufferizing function signatures.
+        --must-infer-memory-space                             - The memory space of a memref type must always be inferred. If unset, a default memory space of 0 is used.
+        --no-analysis-func-filter=<string>                    - Skip analysis of functions with these symbol names. Set copyBeforeWrite to true when bufferizing them.
+        --print-conflicts                                     - Test only: Annotate IR with RaW conflicts. Requires test-analysis-only.
+        --test-analysis-only                                  - Test only: Only run inplaceability analysis and annotate IR
+        --unknown-type-conversion=<string>                    - Controls layout maps for non-inferrable memref types.
+      --outline-shape-computation                             -   Using shape.func to preserve shape computation
+      --post-sparsification-rewrite                           -   Applies sparse tensor rewriting rules after sparsification
+        --enable-convert                                      - Enable rewriting rules for the convert operator
+        --enable-foreach                                      - Enable rewriting rules for the foreach operator
+        --enable-runtime-library                              - Enable runtime library for manipulating sparse tensors
+      --pre-sparsification-rewrite                            -   Applies sparse tensor rewriting rules prior to sparsification
+      --print-ir                                              -   Print IR on the debug stream
+        --label=<string>                                      - Label
+      --print-op-stats                                        -   Print statistics of operations
+        --json                                                - print the stats as JSON
+      --promote-buffers-to-stack                              -   Promotes heap-based allocations to automatically managed stack-based allocations
+        --max-alloc-size-in-bytes=<uint>                      - Maximal size in bytes to promote allocations to stack.
+        --max-rank-of-allocated-memref=<uint>                 - Maximal memref rank to promote dynamic buffers.
+      --reconcile-unrealized-casts                            -   Simplify and eliminate unrealized conversion casts
+      --remove-dead-values                                    -   Remove dead values
+      --remove-shape-constraints                              -   Replace all cstr_ ops with a true witness
+      --resolve-ranked-shaped-type-result-dims                -   Resolve memref.dim of result values of ranked shape type
+      --resolve-shaped-type-result-dims                       -   Resolve memref.dim of result values
+      --rocdl-attach-target                                   -   Attaches a ROCDL target attribute to a GPU Module.
+        -O <uint>                                             - Optimization level.
+        --abi=<string>                                        - ABI version.
+        --chip=<string>                                       - Target chip.
+        --correct-sqrt                                        - Enable correct rounded sqrt.
+        --daz                                                 - Enable denormals are zero opt.
+        --fast                                                - Enable fast relaxed math opt.
+        --features=<string>                                   - Target features.
+        --finite-only                                         - Enable finite only opt.
+        -l <string>                                           - Extra bitcode libraries paths to link to.
+        --module=<string>                                     - Regex used to identify the modules to attach the target to.
+        --triple=<string>                                     - Target triple.
+        --unsafe-math                                         - Enable unsafe math opt.
+        --wave64                                              - Use Wave64 mode.
+      --sccp                                                  -   Sparse Conditional Constant Propagation
+      --scf-bufferize                                         -   Bufferize the scf dialect.
+      --scf-for-loop-canonicalization                         -   Canonicalize operations within scf.for loop bodies
+      --scf-for-loop-peeling                                  -   Peel `for` loops at their upper bounds.
+        --skip-partial                                        - Do not peel loops inside of the last, partial iteration of another already peeled loop.
+      --scf-for-loop-range-folding                            -   Fold add/mul ops into loop range
+      --scf-for-loop-specialization                           -   Specialize `for` loops for vectorization
+      --scf-for-to-while                                      -   Convert SCF for loops to SCF while loops
+      --scf-parallel-loop-fusion                              -   Fuse adjacent parallel loops
+      --scf-parallel-loop-specialization                      -   Specialize parallel loops for vectorization
+      --scf-parallel-loop-tiling                              -   Tile parallel loops
+        --no-min-max-bounds                                   - Perform tiling with fixed upper bound with inbound check inside the internal loops
+        --parallel-loop-tile-sizes=<long>                     - Factors to tile parallel loops by
+      --shape-bufferize                                       -   Bufferize the shape dialect.
+      --shape-to-shape-lowering                               -   Legalize Shape dialect to be convertible to Arith
+      --slice-analysis-test                                   -   Test Slice analysis functionality.
+        --omit-block-arguments                                - Test Slice analysis with multiple blocks but slice omitting block arguments
+      --snapshot-op-locations                                 -   Generate new locations from the current IR
+        --filename=<string>                                   - The filename to print the generated IR
+        --tag=<string>                                        - A tag to use when fusing the new locations with the original. If unset, the locations are replaced.
+      --sparse-buffer-rewrite                                 -   Rewrite sparse primitives on buffers to actual code
+        --enable-buffer-initialization                        - Enable zero-initialization of the memory buffers
+      --sparse-gpu-codegen                                    -   Generates GPU code during sparsification
+        --num_threads=<int>                                   - Sets the number of GPU threads
+      --sparse-storage-specifier-to-llvm                      -   Lower sparse storage specifier to llvm structure
+      --sparse-tensor-codegen                                 -   Convert sparse tensors and primitives to actual code
+        --create-sparse-deallocs                              - Specify if the temporary buffers created by the sparse compiler should be deallocated. For compatibility with core bufferization passes. This option is only used when enable-runtime-library=false. See also create-deallocs for BufferizationOption.
+        --enable-buffer-initialization                        - Enable zero-initialization of the memory buffers
+      --sparse-tensor-conversion                              -   Convert sparse tensors and primitives to library calls
+        --s2s-strategy=<int>                                  - Set the strategy for sparse-to-sparse conversion
+      --sparse-vectorization                                  -   Vectorizes loops after sparsification
+        --enable-simd-index32                                 - Enable i32 indexing into vectors (for efficient gather/scatter)
+        --enable-vla-vectorization                            - Enable vector length agnostic vectorization
+        --vl=<int>                                            - Set the vector length (use 0 to disable vectorization)
+      --sparsification                                        -   Automatically generate sparse tensor code from sparse tensor types
+        --enable-gpu-libgen                                   - Enable GPU acceleration by means of direct library calls (like cuSPARSE)
+        --enable-index-reduction                              - Enable dependent index reduction based algorithm to handle non-trivial index expressions on sparse inputs (experimental features)
+        --enable-runtime-library                              - Enable runtime library for manipulating sparse tensors
+        --gpu-data-transfer-strategy=<value>                  - Set the data transfer strategy
+    =regular-dma                                        -   Default option: malloc on host without additional options or care and then use DMA to copy the data
+    =pinned-dma                                         -   Based on the default option, pin the host memory to accelerate the data transfer
+    =zero-copy                                          -   Use zero-copy to perform the data transfer from the host to the GPU
+        --parallelization-strategy=<value>                    - Set the parallelization strategy
+    =none                                               -   Turn off sparse parallelization.
+    =dense-outer-loop                                   -   Enable dense outer loop sparse parallelization.
+    =any-storage-outer-loop                             -   Enable sparse parallelization regardless of storage for the outer loop.
+    =dense-any-loop                                     -   Enable dense parallelization for any loop.
+    =any-storage-any-loop                               -   Enable sparse parallelization for any storage and loop.
+      --sparsification-and-bufferization                      -   Mini-pipeline that combines bufferization and sparsification
+      --spirv-canonicalize-gl                                 -   Canonicalize GLSL ops
+      --spirv-lower-abi-attrs                                 -   Decorate SPIR-V composite type with layout info
+      --spirv-rewrite-inserts                                 -   Rewrite sequential chains of `spirv.CompositeInsert` operations into `spirv.CompositeConstruct` operations
+      --spirv-unify-aliased-resource                          -   Unify access of multiple aliased resources into access of one single resource
+      --spirv-update-vce                                      -   Deduce and attach minimal (version, capabilities, extensions) requirements to spirv.module ops
+      --spirv-webgpu-prepare                                  -   Prepare SPIR-V to target WebGPU by expanding unsupported ops and replacing with supported ones
+      --sroa                                                  -   Scalar Replacement of Aggregates
+      --strip-debuginfo                                       -   Strip debug info from all operations
+      --symbol-dce                                            -   Eliminate dead symbols
+      --symbol-privatize                                      -   Mark symbols private
+        --exclude=<string>                                    - Comma separated list of symbols that should not be marked private
+      --tensor-bufferize                                      -   Bufferize the `tensor` dialect
+      --test-affine-data-copy                                 -   Tests affine data copy utility functions.
+        --for-memref-region                                   - Test copy generation for a single memref region
+        --memref-filter                                       - Enable memref filter testing in affine data copy optimization
+      --test-affine-loop-unswitch                             -   Tests affine loop unswitching / if/else hoisting
+      --test-affine-parametric-tile                           -   Tile affine loops using SSA values as tile sizes
+      --test-affine-reify-value-bounds                        -   Tests ValueBoundsOpInterface with affine dialect reification
+        --reify-to-func-args                                  - Reify in terms of function args
+        --use-arith-ops                                       - Reify with arith dialect ops
+      --test-alias-analysis                                   -   Test alias analysis results.
+      --test-alias-analysis-extending                         -   Test alias analysis extending.
+      --test-alias-analysis-modref                            -   Test alias analysis ModRef results.
+      --test-arith-emulate-wide-int                           -   Function pass to test Wide Integer Emulation
+        --function-prefix=<string>                            - Prefix of functions to run the emulation pass on
+        --widest-int-supported=<uint>                         - Maximum integer bit width supported by the target
+      --test-block-is-in-loop                                 -   Test mlir::blockIsInLoop()
+      --test-bytecode-callback                                -   Test encoding of a dialect type/attributes with a custom callback
+        --callback-test=<int>                                 - Specifies the test kind to execute
+        --test-dialect-version=<value>                        - Specifies the test dialect version to emit and parse
+      --test-cf-assert                                        -   Function pass to test cf.assert lowering to LLVM without abort
+      --test-cfg-loop-info                                    -   Test the loop info analysis.
+      --test-clone                                            -   Test clone of op
+      --test-commutativity-utils                              -   Test the functionality of the commutativity utility
+      --test-compose-subview                                  -   Test combining composed subviews
+      --test-constant-fold                                    -   Test operation constant folding
+      --test-control-flow-sink                                -   Test control-flow sink pass
+      --test-convert-call-op                                  -   Tests conversion of `func.call` to `llvm.call` in presence of custom types
+      --test-create-vector-broadcast                          -   Test optimization transformations for transfer ops
+      --test-data-layout-query                                -   Test data layout queries
+      --test-dead-code-analysis                               -   
+      --test-decompose-affine-ops                             -   Tests affine ops decomposition utility functions.
+      --test-decompose-call-graph-types                       -   Decomposes types at call graph boundaries.
+      --test-derived-attr                                     -   Run test derived attributes
+      --test-diagnostic-filter                                -   Test diagnostic filtering support.
+        --filters=<string>                                    - Specifies the diagnostic file name filters.
+      --test-dialect-conversion-pdll                          -   Test DialectConversion PDLL functionality
+      --test-distinct-attrs                                   -   Test parallel creation of distinct attributes
+      --test-dynamic-pipeline                                 -   Tests the dynamic pipeline feature by applying a pipeline on a selected set of functions
+        --dynamic-pipeline=<string>                           - The pipeline description that will run on the filtered function.
+        --op-name=<string>                                    - List of function names to apply the pipeline to
+        --run-on-nested-operations                            - This will apply the pipeline on nested operations under the visited operation.
+        --run-on-parent                                       - This will apply the pipeline on the parent operation if it exists; this is expected to fail.
+      --test-elements-attr-interface                          -   Test ElementsAttr interface support.
+      --test-emulate-narrow-int                               -   Function pass to test Narrow Integer Emulation
+        --arith-compute-bitwidth=<uint>                       - arith computation bit width
+        --memref-load-bitwidth=<uint>                         - memref load/store emulation bit width
+      --test-expand-math                                      -   Test expanding math
+      --test-extract-fixed-outer-loops                        -   test application of parametric tiling to the outer loops so that the ranges of outer loops become static
+        --test-outer-loop-sizes=<long>                        - fixed number of iterations that the outer loops should have
+      --test-fold-arith-extf-into-vector-contract-patterns    -   Test patterns that fold arithmetic extension ops into vector contract ops
+      --test-foo-analysis                                     -   
+      --test-func-erase-arg                                   -   Test erasing func args.
+      --test-func-erase-result                                -   Test erasing func results.
+      --test-func-insert-arg                                  -   Test inserting func args.
+      --test-func-insert-result                               -   Test inserting func results.
+      --test-func-set-type                                    -   Test FunctionOpInterface::setType.
+      --test-function-pass                                    -   Test a function pass in the pass manager
+      --test-generic-ir-block-visitors-interrupt              -   Test generic IR visitors with interrupts, starting with Blocks.
+      --test-generic-ir-region-visitors-interrupt             -   Test generic IR visitors with interrupts, starting with Regions.
+      --test-generic-ir-visitors                              -   Test generic IR visitors.
+      --test-generic-ir-visitors-interrupt                    -   Test generic IR visitors with interrupts.
+      --test-gpu-memory-promotion                             -   Promotes the annotated arguments of gpu.func to workgroup memory.
+      --test-gpu-rewrite                                      -   Applies all rewrite patterns within the GPU dialect.
+      --test-inline                                           -   Test inlining region calls
+      --test-int-range-inference                              -   Test integer range inference analysis
+      --test-interface-pass                                   -   Test an interface pass (running on FunctionOpInterface) in the pass manager
+      --test-ir-visitors                                      -   Test various visitors.
+      --test-last-modified                                    -   
+      --test-lazy-loading                                     -   Test LazyLoading of op
+        --bytecode-version=<int>                              - Specifies the bytecode version to use.
+      --test-legalize-patterns                                -   Run test dialect legalization patterns
+      --test-legalize-type-conversion                         -   Test various type conversion functionalities in DialectConversion
+      --test-legalize-unknown-root-patterns                   -   Test public remapped value mechanism in ConversionPatternRewriter
+      --test-linalg-data-layout-propagation                   -   Test data layout propagation
+      --test-linalg-decompose-ops                             -   Test Linalg decomposition patterns
+        --remove-dead-args-and-results                        - Test patterns to erase unused operands and results
+      --test-linalg-drop-unit-dims                            -   
+      --test-linalg-elementwise-fusion-patterns               -   Test Linalg element wise operation fusion patterns
+        --collapse-dimensions-control=<long>                  - Test controlling dimension collapse pattern
+        --control-fusion-by-expansion                         - Test controlling fusion of reshape with generic op by expansion
+        --fuse-generic-ops                                    - Test fusion of generic operations.
+        --fuse-generic-ops-control                            - Test fusion of generic operations with a control function.
+        --fuse-multiuse-producer                              - Test fusion of producer ops with multiple uses
+        --fuse-with-reshape-by-collapsing                     - Test linalg expand_shape -> generic fusion patterns that collapse the iteration space of the consumer
+        --fuse-with-reshape-by-collapsing-control             - Test controlling the linalg expand_shape -> generic fusion patterns that collapse the iteration space of the consumer
+        --fuse-with-reshape-by-expansion                      - Test fusion of generic operations with reshape by expansion
+      --test-linalg-greedy-fusion                             -   Test Linalg fusion by applying a greedy test transformation.
+      --test-linalg-pad-fusion                                -   Test PadOp fusion
+      --test-linalg-transform-patterns                        -   Test Linalg transformation patterns by applying them greedily.
+        --loop-type=<string>                                  - Specify the type of loops to generate: for, parallel or tiled_loop
+        --peeled-loops=<long>                                 - Loops to be peeled when test-tile-pattern
+        --skip-partial                                        - Skip loops inside partial iterations during peeling
+        --test-bubble-up-extract-slice-op-pattern             - Test rewrite of linalgOp + extract_slice into extract_slice + linalgOp
+        --test-erase-unnecessary-inputs                       - Test patterns to erase unnecessary inputs
+        --test-erase-unused-operands-and-results              - Test patterns to erase unused operands and results
+        --test-generalize-pad-tensor                          - Test transform pad tensor by copying with generic ops
+        --test-generalize-tensor-pack                         - Test transform that generalizes pack ops into a sequence of tensor and Linalg ops
+        --test-generalize-tensor-unpack                       - Test transform that generalizes unpack ops into a sequence of tensor and Linalg ops
+        --test-linalg-to-vector-patterns                      - Test a set of patterns that rewrite a linalg contraction in vector.contract form
+        --test-patterns                                       - Test a mixed set of patterns
+        --test-swap-extract-slice-with-fill-pattern           - Test patterns to swap tensor.extract_slice(linalg.fill())
+        --test-swap-subtensor-padtensor                       - Test rewrite of subtensor(tensor.pad) into tensor.pad(subtensor)
+        --test-vector-transfer-forwarding-patterns            - Test a fused pass that forwards memref.copy to vector.transfer
+        --tile-sizes=<long>                                   - Linalg tile sizes for test-tile-pattern
+      --test-liveness-analysis                                -   
+      --test-loop-fusion                                      -   Tests loop fusion utility functions.
+      --test-loop-permutation                                 -   Tests affine loop permutation utility
+        --permutation-map=<uint>                              - Specify the loop permutation
+      --test-loop-unrolling                                   -   Tests loop unrolling transformation
+        --annotate                                            - Annotate unrolled iterations.
+        --loop-depth=<uint>                                   - Loop depth.
+        --unroll-factor=<ulong>                               - Loop unroll factor.
+        --unroll-up-to-factor                                 - Loop unroll up to factor.
+      --test-make-isolated-from-above                         -   Test making a region isolated from above
+        --clone-ops-with-no-operands                          - Test case with cloning of operations with no operands
+        --clone-ops-with-operands                             - Test case with cloning of operations with operands
+        --simple                                              - Test simple case with no cloning of operations
+      --test-mapping-to-processing-elements                   -   test mapping a single loop on a virtual processor grid
+      --test-match-reduction                                  -   Test the match reduction utility.
+      --test-matchers                                         -   Test C++ pattern matchers.
+      --test-math-algebraic-simplification                    -   Test math algebraic simplification
+      --test-math-polynomial-approximation                    -   Test math polynomial approximations
+        --enable-avx2                                         - Enable approximations that emit AVX2 intrinsics via the X86Vector dialect
+      --test-memref-bound-check                               -   Check memref access bounds
+      --test-memref-dependence-check                          -   Checks dependences between all pairs of memref accesses.
+      --test-memref-stride-calculation                        -   Test operation constant folding
+      --test-merge-blocks                                     -   Test Merging operation in ConversionPatternRewriter
+      --test-mlir-reducer                                     -   Tests MLIR Reduce tool by generating failures
+      --test-module-pass                                      -   Test a module pass in the pass manager
+      --test-multi-buffering                                  -   Test multi buffering transformation
+        --multiplier=<uint>                                   - Decide how many versions of the buffer should be created,
+      --test-next-access                                      -   
+      --test-nvgpu-mmasync-f32-to-tf32-patterns               -   Test patterns to convert mma.sync on f32 with tf32 precision
+        --precision=<string>                                  - Target nvgpu.mma.sync on f32 input with tf32 or tf32x3 precision
+      --test-one-to-n-type-conversion                         -   Test pass for 1:N type conversion
+        --convert-func-ops                                    - Enable conversion on func ops
+        --convert-scf-ops                                     - Enable conversion on scf ops
+        --convert-tuple-ops                                   - Enable conversion on tuple ops
+      --test-opaque-loc                                       -   Changes all leaf locations to opaque locations
+      --test-operations-equality                              -   Test operations equality.
+      --test-options-pass                                     -   Test options parsing capabilities
+        --list=<int>                                          - Example list option
+        --string=<string>                                     - Example string option
+        --string-list=<string>                                - Example string list option
+      --test-pass-crash                                       -   Test a pass in the pass manager that always crashes
+      --test-pass-create-invalid-ir                           -   Test pass that adds an invalid operation in a function body
+        --emit-invalid-ir                                     - Emit invalid IR
+        --signal-pass-failure                                 - Trigger a pass failure
+      --test-pass-failure                                     -   Test a pass in the pass manager that always fails
+      --test-pass-invalid-parent                              -   Test a pass in the pass manager that makes the parent operation invalid
+      --test-pattern-selective-replacement                    -   Test selective replacement in the PatternRewriter
+      --test-patterns                                         -   Run test dialect patterns
+        --max-iterations=<int>                                - Max. iterations in the GreedyRewriteConfig
+        --top-down                                            - Seed the worklist in general top-down order
+      --test-pdl-bytecode-pass                                -   Test PDL ByteCode functionality
+      --test-pdll-pass                                        -   Test PDLL functionality
+      --test-print-callgraph                                  -   Print the contents of a constructed callgraph.
+      --test-print-defuse                                     -   Test various printing.
+      --test-print-dominance                                  -   Print the dominance information for multiple regions.
+      --test-print-invalid                                    -   Test printing invalid ops.
+      --test-print-liveness                                   -   Print the contents of a constructed liveness information.
+      --test-print-nesting                                    -   Test various printing.
+      --test-print-shape-mapping                              -   Print the contents of a constructed shape mapping information.
+      --test-print-topological-sort                           -   Print operations in topological order
+      --test-recursive-types                                  -   Test support for recursive types
+      --test-remapped-value                                   -   Test public remapped value mechanism in ConversionPatternRewriter
+      --test-return-type                                      -   Run return type functions
+      --test-rewrite-dynamic-op                               -   Test rewriting on dynamic operations
+      --test-scalar-vector-transfer-lowering                  -   Test lowering of scalar vector transfers to memref loads/stores.
+        --allow-multiple-uses                                 - Fold transfer operations with multiple uses
+      --test-scf-for-utils                                    -   test scf.for utils
+        --test-replace-with-new-yields                        - Test replacing a loop with a new loop that returns new additional yield values
+      --test-scf-if-utils                                     -   test scf.if utils
+      --test-scf-parallel-loop-collapsing                     -   Test parallel loops collapsing transformation
+        --collapsed-indices-0=<uint>                          - Which loop indices to combine into the position 0 loop index
+        --collapsed-indices-1=<uint>                          - Which loop indices to combine into the position 1 loop index
+        --collapsed-indices-2=<uint>                          - Which loop indices to combine into the position 2 loop index
+      --test-scf-pipelining                                   -   test scf.forOp pipelining
+        --annotate                                            - Annotate operations during loop pipelining transformation
+        --no-epilogue-peeling                                 - Use predicates instead of peeling the epilogue.
+      --test-scf-while-op-builder                             -   test build functions of scf.while
+      --test-shape-function-report                            -   Test pass to report associated shape functions
+      --test-side-effects                                     -   Test side effects interfaces
+      --test-sink-vector-broadcast                            -   Test lowering patterns that eliminate redundant broadcast operations.
+      --test-spirv-entry-point-abi                            -   Set the spirv.entry_point_abi attribute on GPU kernel function within the module, intended for testing only
+        --workgroup-size=<int>                                - Workgroup size to use for all gpu.func kernels in the module, specified with x-dimension first, y-dimension next and z-dimension last. Unspecified dimensions will be set to 1
+      --test-spirv-module-combiner                            -   Tests SPIR-V module combiner library
+      --test-spirv-op-availability                            -   Test SPIR-V op availability
+      --test-spirv-target-env                                 -   Test SPIR-V target environment
+      --test-stats-pass                                       -   Test pass statistics
+      --test-strict-pattern-driver                            -   Test strict mode of pattern driver
+        --strictness=<string>                                 - Can be {AnyOp, ExistingAndNewOps, ExistingOps}
+      --test-symbol-rauw                                      -   Test replacement of symbol uses
+      --test-symbol-uses                                      -   Test detection of symbol uses
+      --test-take-body                                        -   Test Region's takeBody
+      --test-target-materialization-with-no-uses              -   Test a special case of target materialization in DialectConversion
+      --test-tensor-copy-insertion                            -   Module pass to test Tensor Copy Insertion
+        --allow-return-allocs                                 - Allows returning/yielding new allocations from a block.
+        --bufferize-function-boundaries                       - Bufferize function boundaries.
+        --create-deallocs                                     - Specify if new allocations should be deallocated.
+        --must-infer-memory-space                             - The memory space of memref types must always be inferred. If unset, a default memory space of 0 is used.
+      --test-tensor-transform-patterns                        -   Test Tensor transformation patterns by applying them greedily.
+        --test-drop-redundant-insert-slice-rank-expansion     - Test dropping redundant insert_slice rank expansions
+        --test-fold-consecutive-insert-extract-slice          - Test folding consecutive tensor.insert_slice/tensor.extract_slice
+        --test-fold-constant-extract-slice                    - Test folding arith.constant and tensor.extract_slice
+        --test-fold-into-pack-and-unpack                      - Test folding ops into tensor.pack and tensor.unpack
+        --test-reassociative-reshape-folding                  - Test folding of expand_shape/collapse_shape
+        --test-rewrite-extract-slice-from-collapse-shape      - Test swapping tensor.extract_slice of a collapse_shape with loop nest
+        --test-simplify-pack-patterns                         - Test patterns to simplify tensor.pack
+        --test-tracking-listener                              - Test tensor TrackingListener for the transform dialect
+        --use-foreach                                         - Use the scf.forall operation when generating loop nests for the extract_slice of collapse_shape pattern
+      --test-tiling-interface                                 -   Test tiling using TilingInterface
+        --lower-to-scalar-using-scf-for                       - Test lowering to scalar implementation using TilingInterface with scf.for operations
+        --tile-consumer-and-fuse-producer-using-scf-for       - Test tile and fuse transformation using TilingInterface with scf.for operations
+        --tile-consumer-fuse-and-yield-producer-using-scf-for - Test tile and fuse transformation while yielding fused producer replacements using TilingInterface with scf.for operations
+        --tile-using-scf-for                                  - Test tiling using TilingInterface with scf.for operations
+      --test-topological-sort-analysis                        -   Test topological sorting of ops
+      --test-trait-folder                                     -   Run trait folding
+      --test-transform-dialect-erase-schedule                 -   erase transform dialect schedule from the IR
+      --test-transform-dialect-interpreter                    -   apply transform dialect operations one by one
+        --bind-first-extra-to-ops=<string>                    - bind the first extra argument of the top-level op to payload operations of the given kind
+        --bind-first-extra-to-params=<int>                    - bind the first extra argument of the top-level op to the given integer parameters
+        --bind-first-extra-to-results-of-ops=<string>         - bind the first extra argument of the top-level op to results of payload operations of the given kind
+        --bind-second-extra-to-ops=<string>                   - bind the second extra argument of the top-level op to payload operations of the given kind
+        --bind-second-extra-to-params=<int>                   - bind the second extra argument of the top-level op to the given integer parameters
+        --bind-second-extra-to-results-of-ops=<string>        - bind the second extra argument of the top-level op to results of payload operations of the given kind
+        --debug-payload-root-tag=<string>                     - Select the operation with 'transform.target_tag' attribute having the given value as payload IR root. If empty select the pass anchor operation as the payload IR root.
+        --debug-transform-root-tag=<string>                   - Select the operation with 'transform.target_tag' attribute having the given value as container IR for top-level transform ops. This allows user control on what transformation to apply. If empty, select the container of the top-level transform op.
+        --enable-expensive-checks                             - perform expensive checks to better report errors in the transform IR
+        --test-module-generation                              - test the generation of the transform module during pass initialization, overridden by parsing
+        --transform-file-name=<string>                        - Optional filename containing a transform dialect specification to apply. If left empty, the IR is assumed to contain one top-level transform dialect operation somewhere in the module.
+        --transform-library-file-name=<string>                - Optional name of the file containing transform dialect symbol definitions to be injected into the transform module.
+      --test-type-interfaces                                  -   Test type interface support.
+      --test-vector-break-down-bitcast                        -   Test pattern that breaks down vector.bitcast ops 
+      --test-vector-contraction-prepare-for-mmt-lowering      -   Test vector.contraction matmul canonicalization for MMT lowering.
+      --test-vector-extract-strided-slice-lowering            -   Test lowering patterns that convert vector.extract_strided_slice into a chain of vector.extract and vector.insert ops
+      --test-vector-gather-lowering                           -   Test patterns that lower the gather op in the vector conditional loads
+      --test-vector-reduction-to-contract-patterns            -   Test patterns to convert multireduce op to contract and combine broadcast/transpose to contract
+      --test-vector-reduction-to-spirv-dot-prod               -   Test lowering patterns that convert vector.reduction to SPIR-V integer dot product ops
+      --test-vector-scan-lowering                             -   Test lowering patterns that lower the scan op in the vector dialect
+      --test-vector-to-vector-lowering                        -   Test lowering patterns between ops in the vector dialect
+        --unroll                                              - Include unrolling
+      --test-vector-transfer-collapse-inner-most-dims         -   Test lowering patterns that reduce the rank of the vector transfer memory and vector operands.
+      --test-vector-transfer-flatten-patterns                 -   Test patterns to rewrite contiguous row-major N-dimensional vector.transfer_{read,write} ops into 1D transfers
+      --test-vector-transfer-unrolling-patterns               -   Test lowering patterns to unroll transfer ops in the vector dialect
+        --reverse-unroll-order                                - reverse the order of unrolling of vector transfer operations
+      --test-vector-transferop-opt                            -   Test optimization transformations for transfer ops
+      --test-vector-unrolling-patterns                        -   Test lowering patterns to unroll contract ops in the vector dialect
+        --unroll-based-on-type                                - Set the unroll factor based on type of the operation
+        --unroll-order=<long>                                 - set the unroll order
+      --test-vector-warp-distribute                           -   Test vector warp distribute transformation and lowering patterns
+        --distribute-transfer-write                           - Test distribution of transfer write
+        --hoist-uniform                                       - Test hoist uniform
+        --propagate-distribution                              - Test distribution propagation
+        --rewrite-warp-ops-to-scf-if                          - Lower vector.warp_execute_on_lane0 to scf.if op
+      --test-verify-uselistorder                              -   Verify that roundtripping the IR to bytecode preserves the order of the uselists
+        --rng-seed=<uint>                                     - Specify an input random seed
+      --test-written-to                                       -   
+      --topological-sort                                      -   Sort regions without SSA dominance in topological order
+      --tosa-infer-shapes                                     -   Propagate shapes across TOSA operations
+      --tosa-layerwise-constant-fold                          -   Fold layerwise operations on constant tensors
+      --tosa-make-broadcastable                               -   TOSA rank Reshape to enable Broadcasting
+      --tosa-optional-decompositions                          -   Applies Tosa operations optional decompositions
+      --tosa-test-quant-utils                                 -   TOSA Test: Exercise the APIs in QuantUtils.cpp.
+      --tosa-to-arith                                         -   Lower TOSA to the Arith dialect
+        --include-apply-rescale                               - Whether to include the lowering for tosa.apply_rescale to arith
+        --use-32-bit                                          - Whether to prioritize lowering to 32-bit operations
+      --tosa-to-linalg                                        -   Lower TOSA to LinAlg on tensors
+      --tosa-to-linalg-named                                  -   Lower TOSA to LinAlg named operations
+      --tosa-to-scf                                           -   Lower TOSA to the SCF dialect
+      --tosa-to-tensor                                        -   Lower TOSA to the Tensor dialect
+      --tosa-validate                                         -   Validates TOSA dialect
+        --level=<value>                                       - Validate if operator parameters are within specification for the given level
+    =8k                                                 -   Ranges are expected to be sufficient for applications with frame sizes up to 8K.
+    =none                                               -   Allows the full range of arguments specified by the operations according to the operation data types.
+        --profile=<value>                                     - Validate if operations match for the given profile
+    =bi                                                 -   Use Base Inference profile.
+    =mi                                                 -   Use Main Inference profile.
+    =mt                                                 -   Use Main Training profile.
+    =undefined                                          -   Do not define a profile.
+        --strict-op-spec-alignment                            - Verify if the properties of certain operations align with the spec requirement
+      --transform-dialect-check-uses                          -   warn about potential use-after-free in the transform dialect
+      --transform-infer-effects                               -   infer transform side effects for symbols
+      --vector-bufferize                                      -   Bufferize Vector dialect ops
+      --view-op-graph                                         -   Print Graphviz visualization of an operation
+        --max-label-len=<uint>                                - Limit attribute/type length to number of chars
+        --print-attrs                                         - Print attributes of operations
+        --print-control-flow-edges                            - Print control flow edges
+        --print-data-flow-edges                               - Print data flow edges
+        --print-result-types                                  - Print result types of operations
+    Pass Pipelines:
+      --sparse-compiler                                       -   The standard pipeline for taking sparsity-agnostic IR using the sparse-tensor type, and lowering it to LLVM IR with concrete representations and algorithms for sparse tensors.
+        --create-sparse-deallocs                              - Specify if the temporary buffers created by the sparse compiler should be deallocated. For compatibility with core bufferization passes. This option is only used when enable-runtime-library=false. See also create-deallocs for BufferizationOption.
+        --enable-amx                                          - Enables the use of AMX dialect while lowering the vector dialect
+        --enable-arm-neon                                     - Enables the use of ArmNeon dialect while lowering the vector dialect
+        --enable-arm-sve                                      - Enables the use of ArmSVE dialect while lowering the vector dialect
+        --enable-buffer-initialization                        - Enable zero-initialization of memory buffers
+        --enable-gpu-libgen                                   - Enables GPU acceleration by means of direct library calls (like cuSPARSE)
+        --enable-index-optimizations                          - Allows compiler to assume indices fit in 32-bit if that yields faster code
+        --enable-index-reduction                              - Enable dependent index reduction based algorithm to handle non-trivial index expressions on sparse inputs (experimental features)
+        --enable-runtime-library                              - Enable runtime library for manipulating sparse tensors
+        --enable-x86vector                                    - Enables the use of X86Vector dialect while lowering the vector dialect
+        --gpu-chip=<string>                                   - GPU target architecture
+        --gpu-data-transfer-strategy=<value>                  - Set the data transfer strategy between the host and the GPUs
+    =regular-dma                                        -   Default option: malloc on host without additional options or care and then use DMA to copy the data
+    =pinned-dma                                         -   Based on the default option, pin the host memory to accelerate the data transfer
+    =zero-copy                                          -   Use zero-copy to perform the data transfer from the host to the GPU
+        --gpu-features=<string>                               - GPU target features
+        --gpu-triple=<string>                                 - GPU target triple
+        --parallelization-strategy=<value>                    - Set the parallelization strategy
+    =none                                               -   Turn off sparse parallelization.
+    =dense-outer-loop                                   -   Enable dense outer loop sparse parallelization.
+    =any-storage-outer-loop                             -   Enable sparse parallelization regardless of storage for the outer loop.
+    =dense-any-loop                                     -   Enable dense parallelization for any loop.
+    =any-storage-any-loop                               -   Enable sparse parallelization for any storage and loop.
+        --reassociate-fp-reductions                           - Allows llvm to reassociate floating-point reductions for speed
+        --s2s-strategy=<int>                                  - Set the strategy for sparse-to-sparse conversion
+        --test-bufferization-analysis-only                    - Run only the inplacability analysis
+        --vl=<int>                                            - Set the vector length (0 disables vectorization)
+      --test-lower-to-llvm                                    -   An example of pipeline to lower the main dialects (arith, linalg, memref, scf, vector) down to LLVM.
+        --reassociate-fp-reductions                           - Allow reassociation of FP reductions
+      --test-options-pass-pipeline                            -   Parses options using pass pipeline registration
+        --list=<int>                                          - Example list option
+        --string=<string>                                     - Example string option
+        --string-list=<string>                                - Example string list option
+      --test-pm-nested-pipeline                               -   Test a nested pipeline in the pass manager
+      --test-textual-pm-nested-pipeline                       -   Test a nested pipeline in the pass manager
+  --test-legalize-mode=<value>                                - The legalization mode to use with the test driver
+    =analysis                                                 -   Perform an analysis conversion
+    =full                                                     -   Perform a full conversion
+    =partial                                                  -   Perform a partial conversion
+  --type-based-intrinsic-cost                                 - Calculate intrinsics cost based only on argument types
+  --verify-diagnostics                                        - Check that emitted diagnostics match expected-* lines on the corresponding line
+  --verify-each                                               - Run the verifier after each transformation pass
+  --verify-region-info                                        - Verify region info (time consuming)
+  --verify-roundtrip                                          - Round-trip the IR after parsing and ensure it succeeds
+  --vp-counters-per-site=<number>                             - The average number of profile counters allocated per value profiling site.
+  --vp-static-alloc                                           - Do static counter allocation for value profiler
+
+Generic Options:
+
+  --help                                                      - Display available options (--help-hidden for more)
+  --help-list                                                 - Display list of available options (--help-list-hidden for more)
+  --version                                                   - Display the version of this program
+
+affine-super-vectorizer-test options:
+
+  --backward-slicing                                          - Enable testing backward static slicing and topological sort functionalities
+  --compose-maps                                              - Enable testing the composition of AffineMap where each AffineMap in the composition is specified as the affine_map attribute in a constant op.
+  --forward-slicing                                           - Enable testing forward static slicing and topological sort functionalities
+  --slicing                                                   - Enable testing static slicing and topological sort functionalities
+  --vector-shape-ratio=<int>                                  - Specify the HW vector size for vectorization
+  --vectorize-affine-loop-nest                                - Enable testing for the 'vectorizeAffineLoopNest' utility by vectorizing the outermost loops found
+
+test-loop-fusion options:
+
+  --test-loop-fusion-dependence-check                         - Enable testing of loop fusion dependence check
+  --test-loop-fusion-slice-computation                        - Enable testing of loop fusion slice computation
+  --test-loop-fusion-transformation                           - Enable testing of loop fusion transformation
diff --git a/mlir_issue/temp.mlir b/mlir_issue/temp.mlir
new file mode 100644
index 000000000000000..7cf369230ec24a4
--- /dev/null
+++ b/mlir_issue/temp.mlir
@@ -0,0 +1,17 @@
+  func.func private @func1(%arg0: index) -> memref<11x5xf32> { // %arg0 is never referenced in the body
+    %cst_1 = arith.constant 0x4DAB5ADE : f32 // not referenced below
+    %cst_2 = arith.constant 1.840000e+04 : f16 // not referenced below
+    %false = arith.constant false // not referenced below
+    %c7 = arith.constant 7 : index // dynamic-size operand of %7
+    %c8 = arith.constant 8 : index // not referenced below
+    %c15 = arith.constant 15 : index // dynamic-size operand of %325
+    %c16 = arith.constant 16 : index // dynamic-size operand of %7
+    %c26 = arith.constant 26 : index // dynamic-size operand of %325
+    %7 = tensor.empty(%c7, %c16) : tensor<?x?x28xi16> // initial value of the loop-carried tensor
+    %124 = affine.for %arg1 = 0 to 92 iter_args(%arg2 = %7) -> (tensor<?x?x28xi16>) { // result %124 is never used
+      %325 = tensor.empty(%c26, %c15) : tensor<?x?x28xi16> // fresh tensor each iteration; %arg1 and %arg2 are not read
+      affine.yield %325 : tensor<?x?x28xi16> // becomes %arg2 of the next iteration
+    }
+    %alloc_34 = memref.alloc() : memref<11x5xf32> // does not depend on the loop above
+    return %alloc_34 : memref<11x5xf32>
+  } 
\ No newline at end of file