[Mlir-commits] [llvm] [mlir] [tosa-fuser] Affine Fusion Pass (PR #107383)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Thu Sep 5 04:33:17 PDT 2024


https://github.com/asr-compiler created https://github.com/llvm/llvm-project/pull/107383

None

>From 268eb827ab3a3fddcd284686cd83ef8fee253b6e Mon Sep 17 00:00:00 2001
From: Akshar S Ramesh <aksharctt at gmail.com>
Date: Wed, 4 Sep 2024 12:31:57 +0530
Subject: [PATCH 1/2] Add tosa-fuser-opt tool

---
 mlir/include/mlir/Registration/Pipelines.h   |  11 +
 mlir/lib/CMakeLists.txt                      |   1 +
 mlir/lib/Registration/CMakeLists.txt         |  13 +
 mlir/lib/Registration/Pipelines.cpp          |  54 ++++
 mlir/tools/CMakeLists.txt                    |   1 +
 mlir/tools/tosa-fuser-opt/CMakeLists.txt     | 111 +++++++
 mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp | 321 +++++++++++++++++++
 7 files changed, 512 insertions(+)
 create mode 100644 mlir/include/mlir/Registration/Pipelines.h
 create mode 100644 mlir/lib/Registration/CMakeLists.txt
 create mode 100644 mlir/lib/Registration/Pipelines.cpp
 create mode 100644 mlir/tools/tosa-fuser-opt/CMakeLists.txt
 create mode 100644 mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp

diff --git a/mlir/include/mlir/Registration/Pipelines.h b/mlir/include/mlir/Registration/Pipelines.h
new file mode 100644
index 00000000000000..7a063b354bd862
--- /dev/null
+++ b/mlir/include/mlir/Registration/Pipelines.h
@@ -0,0 +1,11 @@
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <cstdint>
+
+namespace mlir {
+    struct TosaFuserPipelineOptions : public PassPipelineOptions<TosaFuserPipelineOptions> {
+
+    };
+    void createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOptions &options, unsigned optLevel);
+    void registerTosaFuserPipeline();
+}
\ No newline at end of file
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
index d25c84a3975db4..435c1dfd0eb97d 100644
--- a/mlir/lib/CMakeLists.txt
+++ b/mlir/lib/CMakeLists.txt
@@ -20,3 +20,4 @@ add_subdirectory(Target)
 add_subdirectory(Tools)
 add_subdirectory(Transforms)
 add_subdirectory(ExecutionEngine)
+add_subdirectory(Registration)
diff --git a/mlir/lib/Registration/CMakeLists.txt b/mlir/lib/Registration/CMakeLists.txt
new file mode 100644
index 00000000000000..4d08e615c062d7
--- /dev/null
+++ b/mlir/lib/Registration/CMakeLists.txt
@@ -0,0 +1,13 @@
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+
+add_mlir_library(TosaFuserRegistration
+    Pipelines.cpp
+
+    LINK_LIBS PUBLIC
+    ${dialect_libs}
+    ${conversion_libs}
+    MLIRPass
+    MLIRTransforms
+    MLIRGPUTransforms
+)
\ No newline at end of file
diff --git a/mlir/lib/Registration/Pipelines.cpp b/mlir/lib/Registration/Pipelines.cpp
new file mode 100644
index 00000000000000..43095b154e0ef9
--- /dev/null
+++ b/mlir/lib/Registration/Pipelines.cpp
@@ -0,0 +1,54 @@
+#include "mlir/Registration/Pipelines.h"
+#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h"
+#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h"
+#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRVPass.h"
+
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
+#include <optional>
+
+using namespace mlir;
+
+void mlir::createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOptions &options,
+                                   unsigned optLevel) {
+    pm.addNestedPass<func::FuncOp>(tosa::createTosaToLinalg());
+    pm.addPass(bufferization::createEmptyTensorEliminationPass());
+    pm.addNestedPass<func::FuncOp>(bufferization::createEmptyTensorToAllocTensorPass());
+    pm.addPass(bufferization::createOneShotBufferizePass());
+    pm.addPass(createCanonicalizerPass());
+    pm.addPass(createConvertLinalgToAffineLoopsPass());
+}
+
+static void tosaFuser3(OpPassManager &pm, const TosaFuserPipelineOptions &options) {
+    createTosaFuserPipeline(pm, options, 3);
+}
+
+void mlir::registerTosaFuserPipeline() {
+    static bool init_once = []() {
+        PassPipelineRegistration<TosaFuserPipelineOptions>(
+            "O3", "Tosa-Fuser Pipeline O3", tosaFuser3);
+        return true;
+    }();
+}
\ No newline at end of file
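
For context, the pipeline registered above becomes an ordinary pass-pipeline flag on the new tool, so tosa-fuser-opt can be driven like any other *-opt binary. The sketch below is illustrative only: the function name and RUN line are hypothetical and merely assume the "O3" pipeline name registered in Pipelines.cpp.

// RUN: tosa-fuser-opt --O3 %s
// Hypothetical input: a single TOSA elementwise op that the pipeline lowers to
// linalg.generic, bufferizes, and then converts to affine loops.
func.func @example_add(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> tensor<2x32xf32> {
  %0 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
  return %0 : tensor<2x32xf32>
}
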
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
index 9b474385fdae18..01d80f5743fdeb 100644
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(mlir-translate)
 add_subdirectory(mlir-vulkan-runner)
 add_subdirectory(tblgen-lsp-server)
 add_subdirectory(tblgen-to-irdl)
+add_subdirectory(tosa-fuser-opt)
 
 # mlir-cpu-runner requires ExecutionEngine.
 if(MLIR_ENABLE_EXECUTION_ENGINE)
diff --git a/mlir/tools/tosa-fuser-opt/CMakeLists.txt b/mlir/tools/tosa-fuser-opt/CMakeLists.txt
new file mode 100644
index 00000000000000..965cc0138e5b4e
--- /dev/null
+++ b/mlir/tools/tosa-fuser-opt/CMakeLists.txt
@@ -0,0 +1,111 @@
+set(LLVM_OPTIONAL_SOURCES
+  null.cpp
+)
+
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
+set(LLVM_LINK_COMPONENTS
+  Core
+  Support
+  AsmParser
+  )
+
+if(MLIR_INCLUDE_TESTS)
+  set(test_libs
+    ${cuda_test_libs}
+    MLIRTestFuncToLLVM
+    MLIRAffineTransformsTestPasses
+    MLIRArithTestPasses
+    MLIRArmNeonTestPasses
+    MLIRArmSMETestPasses
+    MLIRBufferizationTestPasses
+    MLIRControlFlowTestPasses
+    MLIRDLTITestPasses
+    MLIRFuncTestPasses
+    MLIRGPUTestPasses
+    MLIRLinalgTestPasses
+    MLIRLoopLikeInterfaceTestPasses
+    MLIRMathTestPasses
+    MLIRTestMathToVCIX
+    MLIRMemRefTestPasses
+    MLIRMeshTest
+    MLIRNVGPUTestPasses
+    MLIRSCFTestPasses
+    MLIRShapeTestPasses
+    MLIRSPIRVTestPasses
+    MLIRTensorTestPasses
+    MLIRTestAnalysis
+    MLIRTestConvertToSPIRV
+    MLIRTestDialect
+    MLIRTestDynDialect
+    MLIRTestIR
+    MLIRTestOneToNTypeConversionPass
+    MLIRTestPass
+    MLIRTestReducer
+    MLIRTestTransforms
+    MLIRTilingInterfaceTestPasses
+    MLIRVectorTestPasses
+    MLIRTestVectorToSPIRV
+    MLIRLLVMTestPasses
+    )
+  set(test_libs ${test_libs}
+    MLIRTestPDLL
+    MLIRTestTransformDialect
+    )
+
+  if (MLIR_ENABLE_PDL_IN_PATTERNMATCH)
+    set(test_libs ${test_libs}
+      MLIRTestPDLL
+      MLIRTestRewrite
+      )
+  endif()
+endif()
+
+set(LIBS
+  ${dialect_libs}
+  ${conversion_libs}
+  ${extension_libs}
+  ${test_libs}
+
+  MLIRAffineAnalysis
+  MLIRAnalysis
+  MLIRCastInterfaces
+  MLIRDialect
+  MLIROptLib
+  MLIRParser
+  MLIRPass
+  MLIRTransforms
+  MLIRTransformUtils
+  MLIRSupport
+  MLIRIR
+  TosaFuserRegistration
+
+  # TODO: Remove when registerAllGPUToLLVMIRTranslations is no longer
+  # registered directly in tosa-fuser-opt.cpp.
+  MLIRToLLVMIRTranslationRegistration
+  )
+
+# Exclude from libMLIR.so because this has static options intended for
+# opt-like tools only.
+add_mlir_library(MLIRTosaFuserOptMain
+  tosa-fuser-opt.cpp
+
+  EXCLUDE_FROM_LIBMLIR
+
+  LINK_LIBS PUBLIC
+  ${LIBS}
+  )
+
+add_mlir_tool(tosa-fuser-opt
+  tosa-fuser-opt.cpp
+
+  DEPENDS
+  ${LIBS}
+  SUPPORT_PLUGINS
+  )
+target_link_libraries(tosa-fuser-opt PRIVATE ${LIBS})
+llvm_update_compile_flags(tosa-fuser-opt)
+
+mlir_check_all_link_libraries(tosa-fuser-opt)
+export_executable_symbols_for_plugins(tosa-fuser-opt)
diff --git a/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp b/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp
new file mode 100644
index 00000000000000..95a59f44039699
--- /dev/null
+++ b/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp
@@ -0,0 +1,321 @@
+//===- tosa-fuser-opt.cpp - TOSA Fuser Optimizer Driver --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Main entry point for tosa-fuser-opt when built as a standalone binary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Config/mlir-config.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllExtensions.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Registration/Pipelines.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Target/LLVMIR/Dialect/All.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace llvm;
+using namespace mlir;
+
+// Defined in the test directory, no public header.
+namespace mlir {
+void registerCloneTestPasses();
+void registerConvertToTargetEnvPass();
+void registerLazyLoadingTestPasses();
+void registerLoopLikeInterfaceTestPasses();
+void registerPassManagerTestPass();
+void registerPrintSpirvAvailabilityPass();
+void registerRegionTestPasses();
+void registerShapeFunctionTestPasses();
+void registerSideEffectTestPasses();
+void registerSliceAnalysisTestPass();
+void registerSymbolTestPasses();
+void registerTestAffineAccessAnalysisPass();
+void registerTestAffineDataCopyPass();
+void registerTestAffineLoopUnswitchingPass();
+void registerTestAffineReifyValueBoundsPass();
+void registerTestAffineWalk();
+void registerTestBytecodeRoundtripPasses();
+void registerTestDecomposeAffineOpPass();
+void registerTestFunc();
+void registerTestGpuLoweringPasses();
+void registerTestGpuMemoryPromotionPass();
+void registerTestLoopPermutationPass();
+void registerTestMatchers();
+void registerTestOperationEqualPass();
+void registerTestPreserveUseListOrders();
+void registerTestPrintDefUsePass();
+void registerTestPrintInvalidPass();
+void registerTestPrintNestingPass();
+void registerTestReducer();
+void registerTestSpirvEntryPointABIPass();
+void registerTestSpirvModuleCombinerPass();
+void registerTestTraitsPass();
+void registerTosaTestQuantUtilAPIPass();
+void registerVectorizerTestPass();
+
+namespace test {
+void registerCommutativityUtils();
+void registerConvertCallOpPass();
+void registerConvertFuncOpPass();
+void registerInliner();
+void registerMemRefBoundCheck();
+void registerPatternsTestPass();
+void registerSimpleParametricTilingPass();
+void registerTestAffineLoopParametricTilingPass();
+void registerTestAliasAnalysisPass();
+void registerTestArithEmulateWideIntPass();
+void registerTestBuiltinAttributeInterfaces();
+void registerTestBuiltinDistinctAttributes();
+void registerTestCallGraphPass();
+void registerTestCfAssertPass();
+void registerTestCFGLoopInfoPass();
+void registerTestComposeSubView();
+void registerTestCompositePass();
+void registerTestConstantFold();
+void registerTestControlFlowSink();
+void registerTestDataLayoutPropagation();
+void registerTestDataLayoutQuery();
+void registerTestDeadCodeAnalysisPass();
+void registerTestDecomposeCallGraphTypes();
+void registerTestDiagnosticsPass();
+void registerTestDiagnosticsMetadataPass();
+void registerTestDominancePass();
+void registerTestDynamicPipelinePass();
+void registerTestEmulateNarrowTypePass();
+void registerTestExpandMathPass();
+void registerTestFooAnalysisPass();
+void registerTestComposeSubView();
+void registerTestMultiBuffering();
+void registerTestIRVisitorsPass();
+void registerTestGenericIRVisitorsPass();
+void registerTestInterfaces();
+void registerTestIRVisitorsPass();
+void registerTestLastModifiedPass();
+void registerTestLinalgDecomposeOps();
+void registerTestLinalgDropUnitDims();
+void registerTestLinalgElementwiseFusion();
+void registerTestLinalgGreedyFusion();
+void registerTestLinalgRankReduceContractionOps();
+void registerTestLinalgTransforms();
+void registerTestLivenessAnalysisPass();
+void registerTestLivenessPass();
+void registerTestLoopFusion();
+void registerTestLoopMappingPass();
+void registerTestLoopUnrollingPass();
+void registerTestLowerToArmNeon();
+void registerTestLowerToArmSME();
+void registerTestLowerToLLVM();
+void registerTestMakeIsolatedFromAbovePass();
+void registerTestMatchReductionPass();
+void registerTestMathAlgebraicSimplificationPass();
+void registerTestMathPolynomialApproximationPass();
+void registerTestMathToVCIXPass();
+void registerTestMemRefDependenceCheck();
+void registerTestMemRefStrideCalculation();
+void registerTestMeshReshardingSpmdizationPass();
+void registerTestMeshSimplificationsPass();
+void registerTestMultiBuffering();
+void registerTestNextAccessPass();
+void registerTestNVGPULowerings();
+void registerTestOneToNTypeConversionPass();
+void registerTestOpaqueLoc();
+void registerTestOpLoweringPasses();
+void registerTestPadFusion();
+void registerTestRecursiveTypesPass();
+void registerTestSCFUpliftWhileToFor();
+void registerTestSCFUtilsPass();
+void registerTestSCFWhileOpBuilderPass();
+void registerTestSCFWrapInZeroTripCheckPasses();
+void registerTestShapeMappingPass();
+void registerTestSliceAnalysisPass();
+void registerTestSPIRVFuncSignatureConversion();
+void registerTestSPIRVVectorUnrolling();
+void registerTestTensorCopyInsertionPass();
+void registerTestTensorTransforms();
+void registerTestTopologicalSortAnalysisPass();
+void registerTestTransformDialectEraseSchedulePass();
+void registerTestVectorLowerings();
+void registerTestVectorReductionToSPIRVDotProd();
+void registerTestWrittenToPass();
+#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
+void registerTestDialectConversionPasses();
+void registerTestPDLByteCodePass();
+void registerTestPDLLPasses();
+#endif
+} // namespace test
+} // namespace mlir
+
+namespace test {
+void registerTestDialect(DialectRegistry &);
+void registerTestDynDialect(DialectRegistry &);
+void registerTestTilingInterfaceTransformDialectExtension(DialectRegistry &);
+void registerTestTransformDialectExtension(DialectRegistry &);
+} // namespace test
+
+#ifdef MLIR_INCLUDE_TESTS
+void registerTestPasses() {
+  registerCloneTestPasses();
+  registerConvertToTargetEnvPass();
+  registerLazyLoadingTestPasses();
+  registerLoopLikeInterfaceTestPasses();
+  registerPassManagerTestPass();
+  registerPrintSpirvAvailabilityPass();
+  registerRegionTestPasses();
+  registerShapeFunctionTestPasses();
+  registerSideEffectTestPasses();
+  registerSliceAnalysisTestPass();
+  registerSymbolTestPasses();
+  registerTestAffineAccessAnalysisPass();
+  registerTestAffineDataCopyPass();
+  registerTestAffineLoopUnswitchingPass();
+  registerTestAffineReifyValueBoundsPass();
+  registerTestAffineWalk();
+  registerTestBytecodeRoundtripPasses();
+  registerTestDecomposeAffineOpPass();
+  registerTestFunc();
+  registerTestGpuLoweringPasses();
+  registerTestGpuMemoryPromotionPass();
+  registerTestLoopPermutationPass();
+  registerTestMatchers();
+  registerTestOperationEqualPass();
+  registerTestPreserveUseListOrders();
+  registerTestPrintDefUsePass();
+  registerTestPrintInvalidPass();
+  registerTestPrintNestingPass();
+  registerTestReducer();
+  registerTestSpirvEntryPointABIPass();
+  registerTestSpirvModuleCombinerPass();
+  registerTestTraitsPass();
+  registerTosaTestQuantUtilAPIPass();
+  registerVectorizerTestPass();
+
+  mlir::test::registerCommutativityUtils();
+  mlir::test::registerConvertCallOpPass();
+  mlir::test::registerConvertFuncOpPass();
+  mlir::test::registerInliner();
+  mlir::test::registerMemRefBoundCheck();
+  mlir::test::registerPatternsTestPass();
+  mlir::test::registerSimpleParametricTilingPass();
+  mlir::test::registerTestAffineLoopParametricTilingPass();
+  mlir::test::registerTestAliasAnalysisPass();
+  mlir::test::registerTestArithEmulateWideIntPass();
+  mlir::test::registerTestBuiltinAttributeInterfaces();
+  mlir::test::registerTestBuiltinDistinctAttributes();
+  mlir::test::registerTestCallGraphPass();
+  mlir::test::registerTestCfAssertPass();
+  mlir::test::registerTestCFGLoopInfoPass();
+  mlir::test::registerTestComposeSubView();
+  mlir::test::registerTestCompositePass();
+  mlir::test::registerTestConstantFold();
+  mlir::test::registerTestControlFlowSink();
+  mlir::test::registerTestDataLayoutPropagation();
+  mlir::test::registerTestDataLayoutQuery();
+  mlir::test::registerTestDeadCodeAnalysisPass();
+  mlir::test::registerTestDecomposeCallGraphTypes();
+  mlir::test::registerTestDiagnosticsPass();
+  mlir::test::registerTestDiagnosticsMetadataPass();
+  mlir::test::registerTestDominancePass();
+  mlir::test::registerTestDynamicPipelinePass();
+  mlir::test::registerTestEmulateNarrowTypePass();
+  mlir::test::registerTestExpandMathPass();
+  mlir::test::registerTestFooAnalysisPass();
+  mlir::test::registerTestComposeSubView();
+  mlir::test::registerTestMultiBuffering();
+  mlir::test::registerTestIRVisitorsPass();
+  mlir::test::registerTestGenericIRVisitorsPass();
+  mlir::test::registerTestInterfaces();
+  mlir::test::registerTestIRVisitorsPass();
+  mlir::test::registerTestLastModifiedPass();
+  mlir::test::registerTestLinalgDecomposeOps();
+  mlir::test::registerTestLinalgDropUnitDims();
+  mlir::test::registerTestLinalgElementwiseFusion();
+  mlir::test::registerTestLinalgGreedyFusion();
+  mlir::test::registerTestLinalgRankReduceContractionOps();
+  mlir::test::registerTestLinalgTransforms();
+  mlir::test::registerTestLivenessAnalysisPass();
+  mlir::test::registerTestLivenessPass();
+  mlir::test::registerTestLoopFusion();
+  mlir::test::registerTestLoopMappingPass();
+  mlir::test::registerTestLoopUnrollingPass();
+  mlir::test::registerTestLowerToArmNeon();
+  mlir::test::registerTestLowerToArmSME();
+  mlir::test::registerTestLowerToLLVM();
+  mlir::test::registerTestMakeIsolatedFromAbovePass();
+  mlir::test::registerTestMatchReductionPass();
+  mlir::test::registerTestMathAlgebraicSimplificationPass();
+  mlir::test::registerTestMathPolynomialApproximationPass();
+  mlir::test::registerTestMathToVCIXPass();
+  mlir::test::registerTestMemRefDependenceCheck();
+  mlir::test::registerTestMemRefStrideCalculation();
+  mlir::test::registerTestMeshReshardingSpmdizationPass();
+  mlir::test::registerTestMeshSimplificationsPass();
+  mlir::test::registerTestMultiBuffering();
+  mlir::test::registerTestNextAccessPass();
+  mlir::test::registerTestNVGPULowerings();
+  mlir::test::registerTestOneToNTypeConversionPass();
+  mlir::test::registerTestOpaqueLoc();
+  mlir::test::registerTestOpLoweringPasses();
+  mlir::test::registerTestPadFusion();
+  mlir::test::registerTestRecursiveTypesPass();
+  mlir::test::registerTestSCFUpliftWhileToFor();
+  mlir::test::registerTestSCFUtilsPass();
+  mlir::test::registerTestSCFWhileOpBuilderPass();
+  mlir::test::registerTestSCFWrapInZeroTripCheckPasses();
+  mlir::test::registerTestShapeMappingPass();
+  mlir::test::registerTestSliceAnalysisPass();
+  mlir::test::registerTestSPIRVFuncSignatureConversion();
+  mlir::test::registerTestSPIRVVectorUnrolling();
+  mlir::test::registerTestTensorCopyInsertionPass();
+  mlir::test::registerTestTensorTransforms();
+  mlir::test::registerTestTopologicalSortAnalysisPass();
+  mlir::test::registerTestTransformDialectEraseSchedulePass();
+  mlir::test::registerTestVectorLowerings();
+  mlir::test::registerTestVectorReductionToSPIRVDotProd();
+  mlir::test::registerTestWrittenToPass();
+#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
+  mlir::test::registerTestDialectConversionPasses();
+  mlir::test::registerTestPDLByteCodePass();
+  mlir::test::registerTestPDLLPasses();
+#endif
+}
+#endif
+
+int main(int argc, char **argv) {
+  registerAllPasses();
+#ifdef MLIR_INCLUDE_TESTS
+  registerTestPasses();
+#endif
+  DialectRegistry registry;
+  registerAllDialects(registry);
+  registerAllExtensions(registry);
+  registerTosaFuserPipeline();
+
+  // TODO: Remove this and the corresponding MLIRToLLVMIRTranslationRegistration
+  // cmake dependency when a safe dialect interface registration mechanism is
+  // implemented, see D157703 (and corresponding note on the declaration).
+  registerAllGPUToLLVMIRTranslations(registry);
+
+#ifdef MLIR_INCLUDE_TESTS
+  ::test::registerTestDialect(registry);
+  ::test::registerTestTransformDialectExtension(registry);
+  ::test::registerTestTilingInterfaceTransformDialectExtension(registry);
+  ::test::registerTestDynDialect(registry);
+#endif
+  return mlir::asMainReturnCode(mlir::MlirOptMain(
+      argc, argv, "MLIR modular optimizer driver\n", registry));
+}

>From b484e035c7937f5eb5b34c7efa3cd80684e57c41 Mon Sep 17 00:00:00 2001
From: Akshar S Ramesh <aksharctt at gmail.com>
Date: Thu, 5 Sep 2024 10:10:23 +0530
Subject: [PATCH 2/2] Fusion code; Few tests

---
 mlir/include/mlir/Transforms/Passes.h        |   1 +
 mlir/include/mlir/Transforms/Passes.td       |   6 +
 mlir/lib/Registration/Pipelines.cpp          |   3 +
 mlir/lib/Transforms/CMakeLists.txt           |   3 +
 mlir/lib/Transforms/TosaAffineFusion.cpp     | 186 ++++++
 tests/PAF_tosa_add_sub.mlir                  | 309 ++++++++++
 tests/PAF_tosa_reduce_max_min.mlir           | 425 +++++++++++++
 tests/PAF_tosa_reduce_max_min_diff_axis.mlir | 589 +++++++++++++++++++
 tests/tosa_add_sub.mlir                      |   5 +
 tests/tosa_reduce_max_min.mlir               |   5 +
 tests/tosa_reduce_max_min_diff_axis.mlir     |   5 +
 11 files changed, 1537 insertions(+)
 create mode 100644 mlir/lib/Transforms/TosaAffineFusion.cpp
 create mode 100644 tests/PAF_tosa_add_sub.mlir
 create mode 100644 tests/PAF_tosa_reduce_max_min.mlir
 create mode 100644 tests/PAF_tosa_reduce_max_min_diff_axis.mlir
 create mode 100644 tests/tosa_add_sub.mlir
 create mode 100644 tests/tosa_reduce_max_min.mlir
 create mode 100644 tests/tosa_reduce_max_min_diff_axis.mlir

diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 8e4a43c3f24586..19aa4960955618 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -138,6 +138,7 @@ std::unique_ptr<Pass> createCompositeFixedPointPass(
     std::string name, llvm::function_ref<void(OpPassManager &)> populateFunc,
     int maxIterations = 10);
 
+std::unique_ptr<Pass> createTosaAffineFusionPass();
 //===----------------------------------------------------------------------===//
 // Registration
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 000d9f697618e6..95e2cd8959483a 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -577,4 +577,10 @@ def CompositeFixedPointPass : Pass<"composite-fixed-point-pass"> {
   ];
 }
 
+def TosaAffineFusion : Pass<"tosa-affine-fusion"> {
+  let summary = "Fuse affine loops lowered from TOSA";
+  let constructor = "mlir::createTosaAffineFusionPass()";
+  let dependentDialects = ["affine::AffineDialect"];
+}
+
 #endif // MLIR_TRANSFORMS_PASSES
diff --git a/mlir/lib/Registration/Pipelines.cpp b/mlir/lib/Registration/Pipelines.cpp
index 43095b154e0ef9..e2687cc95c0616 100644
--- a/mlir/lib/Registration/Pipelines.cpp
+++ b/mlir/lib/Registration/Pipelines.cpp
@@ -37,8 +37,11 @@ void mlir::createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOpt
     pm.addPass(bufferization::createEmptyTensorEliminationPass());
     pm.addNestedPass<func::FuncOp>(bufferization::createEmptyTensorToAllocTensorPass());
     pm.addPass(bufferization::createOneShotBufferizePass());
+    pm.addPass(func::createFuncBufferizePass());
     pm.addPass(createCanonicalizerPass());
     pm.addPass(createConvertLinalgToAffineLoopsPass());
+
+    pm.addPass(createTosaAffineFusionPass());
 }
 
 static void tosaFuser3(OpPassManager &pm, const TosaFuserPipelineOptions &options) {
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 90c0298fb5e46a..4519619f2652a6 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_library(MLIRTransforms
   SymbolPrivatize.cpp
   TopologicalSort.cpp
   ViewOpGraph.cpp
+  TosaAffineFusion.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
@@ -29,6 +30,8 @@ add_mlir_library(MLIRTransforms
 
   LINK_LIBS PUBLIC
   MLIRAnalysis
+  MLIRAffineDialect
+  MLIRFuncDialect
   MLIRCopyOpInterface
   MLIRFunctionInterfaces
   MLIRLoopLikeInterface
diff --git a/mlir/lib/Transforms/TosaAffineFusion.cpp b/mlir/lib/Transforms/TosaAffineFusion.cpp
new file mode 100644
index 00000000000000..81963e5eeba859
--- /dev/null
+++ b/mlir/lib/Transforms/TosaAffineFusion.cpp
@@ -0,0 +1,186 @@
+#include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/Affine/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_TOSAAFFINEFUSION
+#include "mlir/Transforms/Passes.h.inc"
+} // namespace mlir
+
+#define DEBUG_TYPE "tosa-affine-fusion"
+
+using namespace mlir;
+using namespace mlir::affine;
+
+namespace {
+class TosaAffineFusion : public mlir::impl::TosaAffineFusionBase<TosaAffineFusion> {
+
+public:
+    TosaAffineFusion() = default;
+    void runOnOperation() override;
+    bool checkFusibility(AffineForOp *dstLoop, AffineForOp *srcLoop);
+    void moveIntermediateOps(AffineForOp *dstLoop, AffineForOp *srcLoop);
+    void fuseSiblingLoops(AffineForOp *dstLoop, AffineForOp *srcLoop);
+    bool useInsideLoop(Operation *user, AffineForOp *srcLoop);
+    void fuseLoopsInBlock(Block *block);
+};
+
+bool TosaAffineFusion::checkFusibility(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+    if (dstLoop->getOperation() == srcLoop->getOperation()) {
+        llvm::errs()<<"[CHECKFUSIBILITY LOG] Same Loop\n";
+        return false;
+    }
+
+    if (dstLoop->getOperation()->getParentOp() != srcLoop->getOperation()->getParentOp()) {
+        llvm::errs()<<"[CHECKFUSIBILITY LOG] Parent is not same\n";
+        return false;
+    }
+
+    if (dstLoop->getConstantLowerBound() != srcLoop->getConstantLowerBound()) {
+        llvm::errs()<<"[CHECKFUSIBILITY LOG] Lower Bound is not same\n";
+        return false;
+    }
+
+    if (dstLoop->getConstantUpperBound() != srcLoop->getConstantUpperBound()) {
+        llvm::errs()<<"[CHECKFUSIBILITY LOG] Upper Bound is not same\n";
+        return false;
+    }
+
+    if (dstLoop->getStepAsInt() != srcLoop->getStepAsInt()) {
+        llvm::errs()<<"[CHECKFUSIBILITY LOG] Step is not same\n";
+        return false;
+    }
+
+    llvm::errs()<<"[CHECKFUSIBILITY LOG] SUCCESS\n";
+    return true;
+}
+
+bool TosaAffineFusion::useInsideLoop(Operation *user, AffineForOp *srcLoop) {
+    while (!isa<func::FuncOp>(user->getParentOp())) {
+        auto *parentOp = user->getParentOp();
+        if (user->getParentOp() == srcLoop->getOperation())
+            return true;
+        user = parentOp;
+    }
+    return false;
+}
+
+void TosaAffineFusion::moveIntermediateOps(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+    auto *block = dstLoop->getOperation()->getBlock();
+    bool dstLoopFound = false;
+    for (auto &op : block->getOperations()) {
+        if (&op == dstLoop->getOperation()) {
+            dstLoopFound = true;
+            continue;
+        }
+        if (!dstLoopFound)
+            continue;
+        if (&op == srcLoop->getOperation())
+            break;
+        for (auto *user : op.getUsers())
+            if (useInsideLoop(user, srcLoop))
+                op.moveBefore(dstLoop->getOperation());
+    }
+}
+
+void TosaAffineFusion::fuseSiblingLoops(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+    IRMapping map;
+    map.map(srcLoop->getInductionVar(), dstLoop->getInductionVar());
+    OpBuilder builder(*dstLoop);
+    builder.setInsertionPoint(dstLoop->getBody()->getTerminator());
+
+    for (auto &op : srcLoop->getBody()->getOperations()) {
+        if (&op == srcLoop->getBody()->getTerminator())
+            continue;
+        builder.clone(op, map);
+    }
+}
+
+void TosaAffineFusion::fuseLoopsInBlock(Block *block) {
+    auto affineFors = block->getOps<AffineForOp>();
+    SmallVector<AffineForOp, 4> siblingAffineFors{affineFors.begin(), affineFors.end()};
+
+    for (auto dstLoop : siblingAffineFors) {
+        if (!dstLoop.getOperation()) {
+            llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 1 - DstLoop reference dropped\n";
+            continue;
+        }
+        llvm::errs()<<"[FUSELOOPSINBLOCK LOG] DstLoop -> \n";
+        dstLoop.dump();
+        if (dstLoop->getParentOp() == nullptr) {
+            llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped\n";
+            continue;
+        }
+        for (auto srcLoop : siblingAffineFors) {
+            if (!srcLoop.getOperation()) {
+                llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 1 - SrcLoop reference dropped\n";
+                continue;
+            }
+            llvm::errs()<<"[FUSELOOPSINBLOCK LOG] SrcLoop -> \n";
+            srcLoop.dump();
+            if (srcLoop->getParentOp() == nullptr) {
+                llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped\n";
+                continue;
+            }
+            if (!checkFusibility(&dstLoop, &srcLoop))
+                continue;
+
+            llvm::errs()<<"[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE\n";
+
+            moveIntermediateOps(&dstLoop, &srcLoop);
+
+            fuseSiblingLoops(&dstLoop, &srcLoop);
+
+            srcLoop->dropAllReferences();
+            srcLoop->remove();
+
+            llvm::errs()<<"[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop\n";
+            dstLoop.dump();
+        }
+
+        for (Region &region : dstLoop->getRegions()) {
+            for (Block &block : region.getBlocks()) {
+                auto affineFors = block.getOps<AffineForOp>();
+                if (!affineFors.empty() && !llvm::hasSingleElement(affineFors)) {
+                    llvm::errs()<<"[FUSELOOPSINBLOCK LOG] Recursing into nested block\n";
+
+                    fuseLoopsInBlock(&block);
+                }
+            }
+        }
+    }
+    llvm::errs()<<"[FUSELOOPSINBLOCK LOG] Done fusing block\n";
+}
+
+void TosaAffineFusion::runOnOperation() {
+    getOperation()->walk([&](Operation *op) {
+        for (Region &region : op->getRegions()) {
+            for (Block &block : region.getBlocks()) {
+                auto affineFors = block.getOps<AffineForOp>();
+                if (!affineFors.empty() && !llvm::hasSingleElement(affineFors)) {
+                    fuseLoopsInBlock(&block);
+                }
+            }
+        }
+    });
+}
+
+} // end of namespace
+
+std::unique_ptr<Pass> mlir::createTosaAffineFusionPass() {
+    return std::make_unique<TosaAffineFusion>();
+}
\ No newline at end of file
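
The PAF_*.mlir files added below capture the per-pass IR dumps and the pass's debug output rather than the TOSA-level inputs themselves (those live in the 5-line tests/*.mlir files). Judging from the TosaToLinalg dump that opens the first file, an input of the shape being exercised would look roughly like the following; this is a reconstruction for illustration, not a copy of tests/tosa_add_sub.mlir.

// Illustrative only: two independent elementwise TOSA ops over the same
// operands lower to two sibling affine loop nests, which the new
// tosa-affine-fusion pass then merges into one nest.
func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
  %0 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
  %1 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
  return %0, %1 : tensor<2x32xf32>, tensor<2x32xf32>
}
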
diff --git a/tests/PAF_tosa_add_sub.mlir b/tests/PAF_tosa_add_sub.mlir
new file mode 100644
index 00000000000000..b5c05d74c7c55c
--- /dev/null
+++ b/tests/PAF_tosa_add_sub.mlir
@@ -0,0 +1,309 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+  %0 = tensor.empty() : tensor<2x32xf32>
+  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %4 = arith.addf %in, %in_0 : f32
+    linalg.yield %4 : f32
+  } -> tensor<2x32xf32>
+  %2 = tensor.empty() : tensor<2x32xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %4 = arith.addf %in, %in_0 : f32
+    linalg.yield %4 : f32
+  } -> tensor<2x32xf32>
+  return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+    %0 = tensor.empty() : tensor<2x32xf32>
+    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %4 = arith.addf %in, %in_0 : f32
+      linalg.yield %4 : f32
+    } -> tensor<2x32xf32>
+    %2 = tensor.empty() : tensor<2x32xf32>
+    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %4 = arith.addf %in, %in_0 : f32
+      linalg.yield %4 : f32
+    } -> tensor<2x32xf32>
+    return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+  }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+  %0 = bufferization.alloc_tensor() : tensor<2x32xf32>
+  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %4 = arith.addf %in, %in_0 : f32
+    linalg.yield %4 : f32
+  } -> tensor<2x32xf32>
+  %2 = bufferization.alloc_tensor() : tensor<2x32xf32>
+  %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+  ^bb0(%in: f32, %in_0: f32, %out: f32):
+    %4 = arith.addf %in, %in_0 : f32
+    linalg.yield %4 : f32
+  } -> tensor<2x32xf32>
+  return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+    %0 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %1 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %2 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %3 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%3, %2 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %6 = arith.addf %in, %in_1 : f32
+      linalg.yield %6 : f32
+    }
+    %4 = bufferization.to_tensor %alloc : memref<2x32xf32>
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_0 : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %6 = arith.addf %in, %in_1 : f32
+      linalg.yield %6 : f32
+    }
+    %5 = bufferization.to_tensor %alloc_0 : memref<2x32xf32>
+    return %4, %5 : tensor<2x32xf32>, tensor<2x32xf32>
+  }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+    %0 = bufferization.to_tensor %arg1 : memref<2x32xf32>
+    %1 = bufferization.to_tensor %arg0 : memref<2x32xf32>
+    %2 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %3 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %4 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %5 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %4 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %10 = arith.addf %in, %in_1 : f32
+      linalg.yield %10 : f32
+    }
+    %6 = bufferization.to_tensor %alloc : memref<2x32xf32>
+    %7 = bufferization.to_memref %6 : memref<2x32xf32>
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%3, %2 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_0 : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %10 = arith.addf %in, %in_1 : f32
+      linalg.yield %10 : f32
+    }
+    %8 = bufferization.to_tensor %alloc_0 : memref<2x32xf32>
+    %9 = bufferization.to_memref %8 : memref<2x32xf32>
+    return %7, %9 : memref<2x32xf32>, memref<2x32xf32>
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<2x32xf32>, memref<2x32xf32>) outs(%alloc : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %0 = arith.addf %in, %in_1 : f32
+      linalg.yield %0 : f32
+    }
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<2x32xf32>, memref<2x32xf32>) outs(%alloc_0 : memref<2x32xf32>) {
+    ^bb0(%in: f32, %in_1: f32, %out: f32):
+      %0 = arith.addf %in, %in_1 : f32
+      linalg.yield %0 : f32
+    }
+    return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+  }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+  }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+  }
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %2 = arith.addf %0, %1 : f32
+    affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] Recursing into nested block
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %2 = arith.addf %0, %1 : f32
+  affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %2 = arith.addf %0, %1 : f32
+  affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %2 = arith.addf %0, %1 : f32
+  affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %2 = arith.addf %0, %1 : f32
+  affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+  %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %5 = arith.addf %3, %4 : f32
+  affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+  %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %2 = "arith.addf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] Done fusing block
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+  ^bb0(%arg1: index):
+    %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %2 = "arith.addf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] Done fusing block
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+        %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %5 = arith.addf %3, %4 : f32
+        affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+  }
+}
+
+
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %2 = arith.addf %0, %1 : f32
+        affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+        %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %5 = arith.addf %3, %4 : f32
+        affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+  }
+}
+
diff --git a/tests/PAF_tosa_reduce_max_min.mlir b/tests/PAF_tosa_reduce_max_min.mlir
new file mode 100644
index 00000000000000..3325128cab1953
--- /dev/null
+++ b/tests/PAF_tosa_reduce_max_min.mlir
@@ -0,0 +1,425 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+  %0 = tensor.empty() : tensor<32xf32>
+  %cst = arith.constant -3.40282347E+38 : f32
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+  %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.maximumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+  %2 = tensor.empty() : tensor<32xf32>
+  %cst_0 = arith.constant 3.40282347E+38 : f32
+  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+  %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.minimumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+  return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+module {
+  func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+    %0 = tensor.empty() : tensor<32xf32>
+    %cst = arith.constant -3.40282347E+38 : f32
+    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+    %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.maximumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+    %2 = tensor.empty() : tensor<32xf32>
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+    %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.minimumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+    return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+  %cst = arith.constant 3.40282347E+38 : f32
+  %cst_0 = arith.constant -3.40282347E+38 : f32
+  %0 = bufferization.alloc_tensor() : tensor<32xf32>
+  %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+  %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.maximumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+  %2 = bufferization.alloc_tensor() : tensor<32xf32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+  %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.minimumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+  return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+module {
+  func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+    %0 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %1 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %cst = arith.constant 3.40282347E+38 : f32
+    %cst_0 = arith.constant -3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<32xf32>)
+    linalg.reduce ins(%1 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.maximumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %2 = bufferization.to_tensor %expand_shape : memref<1x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<32xf32>)
+    linalg.reduce ins(%0 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_1 : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.minimumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %3 = bufferization.to_tensor %expand_shape_2 : memref<1x32xf32>
+    return %2, %3 : tensor<1x32xf32>, tensor<1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+    %0 = bufferization.to_tensor %arg1 : memref<2x32xf32>
+    %1 = bufferization.to_tensor %arg0 : memref<2x32xf32>
+    %2 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %3 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %cst = arith.constant 3.40282347E+38 : f32
+    %cst_0 = arith.constant -3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<32xf32>)
+    linalg.reduce ins(%3 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %8 = arith.maximumf %in, %init : f32
+        linalg.yield %8 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %4 = bufferization.to_tensor %expand_shape : memref<1x32xf32>
+    %5 = bufferization.to_memref %4 : memref<1x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<32xf32>)
+    linalg.reduce ins(%2 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_1 : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %8 = arith.minimumf %in, %init : f32
+        linalg.yield %8 : f32
+      }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %6 = bufferization.to_tensor %expand_shape_2 : memref<1x32xf32>
+    %7 = bufferization.to_memref %6 : memref<1x32xf32>
+    return %5, %7 : memref<1x32xf32>, memref<1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %cast = memref.cast %arg1 : memref<2x32xf32> to memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %cast_1 = memref.cast %arg0 : memref<2x32xf32> to memref<2x32xf32, strided<[?, ?], offset: ?>>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc : memref<32xf32>)
+    linalg.reduce ins(%cast_1 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %0 = arith.maximumf %in, %init : f32
+        linalg.yield %0 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc_2 : memref<32xf32>)
+    linalg.reduce ins(%cast : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_2 : memref<32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %0 = arith.minimumf %in, %init : f32
+        linalg.yield %0 : f32
+      }
+    %expand_shape_3 = memref.expand_shape %alloc_2 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    return %expand_shape, %expand_shape_3 : memref<1x32xf32>, memref<1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    affine.for %arg2 = 0 to 32 {
+      affine.store %cst, %alloc[%arg2] : memref<32xf32>
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %alloc[%arg3] : memref<32xf32>
+        %2 = arith.maximumf %0, %1 : f32
+        affine.store %2, %alloc[%arg3] : memref<32xf32>
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    affine.for %arg2 = 0 to 32 {
+      affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+        %2 = arith.minimumf %0, %1 : f32
+        affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+      }
+    }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+  }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg2 = 0 to 32 {
+  affine.store %cst, %alloc[%arg2] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 32 {
+  affine.store %cst, %alloc[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc[%arg3] : memref<32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3] : memref<32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 32 {
+  affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 32 {
+  affine.store %cst, %alloc[%arg2] : memref<32xf32>
+  affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+    %2 = arith.minimumf %0, %1 : f32
+    affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc[%arg3] : memref<32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3] : memref<32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 32 {
+  affine.store %cst, %alloc[%arg2] : memref<32xf32>
+  affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc[%arg3] : memref<32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3] : memref<32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+  "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+    %2 = arith.minimumf %0, %1 : f32
+    affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc[%arg3] : memref<32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3] : memref<32xf32>
+  }
+  affine.for %arg3 = 0 to 32 {
+    %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+    %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+    %2 = arith.minimumf %0, %1 : f32
+    affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %alloc[%arg3] : memref<32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %alloc[%arg3] : memref<32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+  %2 = arith.minimumf %0, %1 : f32
+  affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+  %1 = affine.load %alloc[%arg3] : memref<32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3] : memref<32xf32>
+  %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+  %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+  %5 = arith.minimumf %3, %4 : f32
+  affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+  %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+  "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+  ^bb0(%arg1: index):
+    %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    affine.for %arg2 = 0 to 32 {
+      affine.store %cst, %alloc[%arg2] : memref<32xf32>
+      affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %alloc[%arg3] : memref<32xf32>
+        %2 = arith.maximumf %0, %1 : f32
+        affine.store %2, %alloc[%arg3] : memref<32xf32>
+        %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+        %5 = arith.minimumf %3, %4 : f32
+        affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+  }
+}
+
+
+module {
+  func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+    affine.for %arg2 = 0 to 32 {
+      affine.store %cst, %alloc[%arg2] : memref<32xf32>
+      affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+        %1 = affine.load %alloc[%arg3] : memref<32xf32>
+        %2 = arith.maximumf %0, %1 : f32
+        affine.store %2, %alloc[%arg3] : memref<32xf32>
+        %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+        %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+        %5 = arith.minimumf %3, %4 : f32
+        affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+  }
+}
+
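The CHECKFUSIBILITY messages captured in the expected output above suggest the pass only pairs sibling affine.for loops whose constant lower bound, upper bound and step all agree ("Upper Bound is not same", "Step is not same", "SUCCESS"). A minimal C++ sketch of such a test, written against the standard AffineForOp accessors; the helper name is made up for illustration and is not taken from this patch:

// Sketch only: returns true when two sibling loops have identical constant
// trip counts, mirroring the checks the log messages above report.
#include "mlir/Dialect/Affine/IR/AffineOps.h"

static bool haveMatchingTripCounts(mlir::affine::AffineForOp dst,
                                   mlir::affine::AffineForOp src) {
  if (dst.getOperation() == src.getOperation())
    return false; // "Same Loop": nothing to fuse.
  if (!dst.hasConstantLowerBound() || !src.hasConstantLowerBound() ||
      dst.getConstantLowerBound() != src.getConstantLowerBound())
    return false; // lower bounds differ or are not constant
  if (!dst.hasConstantUpperBound() || !src.hasConstantUpperBound() ||
      dst.getConstantUpperBound() != src.getConstantUpperBound())
    return false; // "Upper Bound is not same"
  if (dst.getStepAsInt() != src.getStepAsInt())
    return false; // "Step is not same"
  return true;    // "SUCCESS": the source body can be merged into dst.
}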
diff --git a/tests/PAF_tosa_reduce_max_min_diff_axis.mlir b/tests/PAF_tosa_reduce_max_min_diff_axis.mlir
new file mode 100644
index 00000000000000..3e92ea17e38da5
--- /dev/null
+++ b/tests/PAF_tosa_reduce_max_min_diff_axis.mlir
@@ -0,0 +1,589 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+  %0 = tensor.empty() : tensor<4x32xf32>
+  %cst = arith.constant -3.40282347E+38 : f32
+  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+  %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.maximumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+  %2 = tensor.empty() : tensor<2x32xf32>
+  %cst_0 = arith.constant 3.40282347E+38 : f32
+  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+  %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1] 
+    (%in: f32, %init: f32) {
+      %4 = arith.minimumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+  return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+module {
+  func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+    %0 = tensor.empty() : tensor<4x32xf32>
+    %cst = arith.constant -3.40282347E+38 : f32
+    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+    %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.maximumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+    %2 = tensor.empty() : tensor<2x32xf32>
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+    %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1] 
+      (%in: f32, %init: f32) {
+        %4 = arith.minimumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+    return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+  %cst = arith.constant 3.40282347E+38 : f32
+  %cst_0 = arith.constant -3.40282347E+38 : f32
+  %0 = bufferization.alloc_tensor() : tensor<4x32xf32>
+  %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+  %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0] 
+    (%in: f32, %init: f32) {
+      %4 = arith.maximumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+  %2 = bufferization.alloc_tensor() : tensor<2x32xf32>
+  %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+  %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1] 
+    (%in: f32, %init: f32) {
+      %4 = arith.minimumf %in, %init : f32
+      linalg.yield %4 : f32
+    }
+  %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+  return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+module {
+  func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+    %0 = bufferization.to_memref %arg1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %1 = bufferization.to_memref %arg0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %cst = arith.constant 3.40282347E+38 : f32
+    %cst_0 = arith.constant -3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<4x32xf32>)
+    linalg.reduce ins(%1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %4 = arith.maximumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %2 = bufferization.to_tensor %expand_shape : memref<1x4x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<2x32xf32>)
+    linalg.reduce ins(%0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_1 : memref<2x32xf32>) dimensions = [1] 
+      (%in: f32, %init: f32) {
+        %4 = arith.minimumf %in, %init : f32
+        linalg.yield %4 : f32
+      }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    %3 = bufferization.to_tensor %expand_shape_2 : memref<2x1x32xf32>
+    return %2, %3 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+    %0 = bufferization.to_tensor %arg1 : memref<2x4x32xf32>
+    %1 = bufferization.to_tensor %arg0 : memref<2x4x32xf32>
+    %2 = bufferization.to_memref %0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %3 = bufferization.to_memref %1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %cst = arith.constant 3.40282347E+38 : f32
+    %cst_0 = arith.constant -3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<4x32xf32>)
+    linalg.reduce ins(%3 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %8 = arith.maximumf %in, %init : f32
+        linalg.yield %8 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %4 = bufferization.to_tensor %expand_shape : memref<1x4x32xf32>
+    %5 = bufferization.to_memref %4 : memref<1x4x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<2x32xf32>)
+    linalg.reduce ins(%2 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_1 : memref<2x32xf32>) dimensions = [1] 
+      (%in: f32, %init: f32) {
+        %8 = arith.minimumf %in, %init : f32
+        linalg.yield %8 : f32
+      }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    %6 = bufferization.to_tensor %expand_shape_2 : memref<2x1x32xf32>
+    %7 = bufferization.to_memref %6 : memref<2x1x32xf32>
+    return %5, %7 : memref<1x4x32xf32>, memref<2x1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %cast = memref.cast %arg1 : memref<2x4x32xf32> to memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %cast_1 = memref.cast %arg0 : memref<2x4x32xf32> to memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    linalg.fill ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
+    linalg.reduce ins(%cast_1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0] 
+      (%in: f32, %init: f32) {
+        %0 = arith.maximumf %in, %init : f32
+        linalg.yield %0 : f32
+      }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    linalg.fill ins(%cst_0 : f32) outs(%alloc_2 : memref<2x32xf32>)
+    linalg.reduce ins(%cast : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_2 : memref<2x32xf32>) dimensions = [1] 
+      (%in: f32, %init: f32) {
+        %0 = arith.minimumf %in, %init : f32
+        linalg.yield %0 : f32
+      }
+    %expand_shape_3 = memref.expand_shape %alloc_2 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    return %expand_shape, %expand_shape_3 : memref<1x4x32xf32>, memref<2x1x32xf32>
+  }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    affine.for %arg2 = 0 to 4 {
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+      }
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 4 {
+        affine.for %arg4 = 0 to 32 {
+          %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+          %2 = arith.maximumf %0, %1 : f32
+          affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+        }
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 4 {
+        affine.for %arg4 = 0 to 32 {
+          %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+          %2 = arith.minimumf %0, %1 : f32
+          affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+        }
+      }
+    }
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+  }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg2 = 0 to 4 {
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 4 {
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+      %2 = arith.maximumf %0, %1 : f32
+      affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    }
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+      %2 = arith.minimumf %0, %1 : f32
+      affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    }
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+      %2 = arith.maximumf %0, %1 : f32
+      affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    }
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 4 {
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+      %2 = arith.maximumf %0, %1 : f32
+      affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    }
+  }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+      %2 = arith.maximumf %0, %1 : f32
+      affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    }
+  }
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+      %2 = arith.minimumf %0, %1 : f32
+      affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    }
+  }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+      %2 = arith.maximumf %0, %1 : f32
+      affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    }
+  }
+  affine.for %arg3 = 0 to 32 {
+    affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+  }
+  affine.for %arg3 = 0 to 4 {
+    affine.for %arg4 = 0 to 32 {
+      %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+      %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+      %2 = arith.minimumf %0, %1 : f32
+      affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    }
+  }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg3 = 0 to 4 {
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+  }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 4 {
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 4 {
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    %2 = arith.minimumf %0, %1 : f32
+    affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 4 {
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+  }
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    %2 = arith.minimumf %0, %1 : f32
+    affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg4 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+  %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg4 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+  %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg4 = 0 to 32 {
+  %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+  %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+  %2 = arith.minimumf %0, %1 : f32
+  affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg4 = 0 to 32 {
+  %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+  %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+  %2 = arith.maximumf %0, %1 : f32
+  affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+  %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+  %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+  %5 = arith.minimumf %3, %4 : f32
+  affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+  %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+  "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+affine.for %arg3 = 0 to 32 {
+  affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 4 {
+  affine.for %arg4 = 0 to 32 {
+    %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+    %2 = arith.maximumf %0, %1 : f32
+    affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+    %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+    %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+    %5 = arith.minimumf %3, %4 : f32
+    affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+  }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+affine.for %arg3 = 0 to 32 {
+  affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+  ^bb0(%arg1: index):
+    %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+  ^bb0(%arg1: index):
+    %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+    "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+  ^bb0(%arg1: index):
+    "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop -> 
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+  "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+  ^bb0(%arg1: index):
+    "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+    ^bb0(%arg2: index):
+      %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+      %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+      %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+      "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+      "affine.yield"() : () -> ()
+    }) : () -> ()
+    "affine.yield"() : () -> ()
+  }) : () -> ()
+  "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+  func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    affine.for %arg2 = 0 to 4 {
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+      }
+    }
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 4 {
+        affine.for %arg4 = 0 to 32 {
+          %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+          %2 = arith.maximumf %0, %1 : f32
+          affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+          %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+          %5 = arith.minimumf %3, %4 : f32
+          affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+        }
+      }
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+  }
+}
+
+
+module {
+  func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+    %cst = arith.constant -3.40282347E+38 : f32
+    %cst_0 = arith.constant 3.40282347E+38 : f32
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+    affine.for %arg2 = 0 to 4 {
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+      }
+    }
+    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+    affine.for %arg2 = 0 to 2 {
+      affine.for %arg3 = 0 to 4 {
+        affine.for %arg4 = 0 to 32 {
+          %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+          %2 = arith.maximumf %0, %1 : f32
+          affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+          %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+          %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+          %5 = arith.minimumf %3, %4 : f32
+          affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+        }
+      }
+      affine.for %arg3 = 0 to 32 {
+        affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+      }
+    }
+    %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+    %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+    return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+  }
+}
+
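Once a source loop passes the check, the traces show its statements appearing inside the destination loop ("New FUSED DSTLoop") and the original loop being discarded ("SrcLoop reference dropped"). A rough sketch of that merge step, under the same assumptions as the helper above: reuse the destination induction variable, splice the source body in front of the destination terminator, and erase the source loop. This is illustrative, not the patch's actual implementation.

// Sketch only: moves src's body into dst so both run in the same iteration.
#include "mlir/Dialect/Affine/IR/AffineOps.h"

static void fuseIntoDst(mlir::affine::AffineForOp dst,
                        mlir::affine::AffineForOp src) {
  mlir::Block *dstBody = dst.getBody();
  mlir::Block *srcBody = src.getBody();
  // All uses of src's induction variable now read dst's induction variable.
  src.getInductionVar().replaceAllUsesWith(dst.getInductionVar());
  // Move everything except the affine.yield terminator ahead of dst's
  // terminator, exactly as the fused loops in the dumps above read.
  dstBody->getOperations().splice(dstBody->getTerminator()->getIterator(),
                                  srcBody->getOperations(), srcBody->begin(),
                                  srcBody->getTerminator()->getIterator());
  // The now-empty source loop is no longer needed.
  src->erase();
}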
diff --git a/tests/tosa_add_sub.mlir b/tests/tosa_add_sub.mlir
new file mode 100644
index 00000000000000..49c4ed9d9684da
--- /dev/null
+++ b/tests/tosa_add_sub.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+  %0 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
+  %1 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
+  return %0, %1 : tensor<2x32xf32>, tensor<2x32xf32>
+}
\ No newline at end of file
diff --git a/tests/tosa_reduce_max_min.mlir b/tests/tosa_reduce_max_min.mlir
new file mode 100644
index 00000000000000..ef5c6fa78aab5e
--- /dev/null
+++ b/tests/tosa_reduce_max_min.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+  %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<2x32xf32>) -> tensor<1x32xf32>
+  %1 = tosa.reduce_min %arg1 {axis = 0 : i32} : (tensor<2x32xf32>) -> tensor<1x32xf32>
+  return %0, %1 : tensor<1x32xf32>, tensor<1x32xf32>
+}
\ No newline at end of file
diff --git a/tests/tosa_reduce_max_min_diff_axis.mlir b/tests/tosa_reduce_max_min_diff_axis.mlir
new file mode 100644
index 00000000000000..d7a97a414b6595
--- /dev/null
+++ b/tests/tosa_reduce_max_min_diff_axis.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+  %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<2x4x32xf32>) -> tensor<1x4x32xf32>
+  %1 = tosa.reduce_min %arg1 {axis = 1 : i32} : (tensor<2x4x32xf32>) -> tensor<2x1x32xf32>
+  return %0, %1 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
\ No newline at end of file
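Putting the two hypothetical helpers above together, the driver the FUSELOOPSINBLOCK traces hint at would collect the affine.for ops that are direct children of a block, greedily fold each later matching sibling into the current loop, and then recurse into the fused body to repeat the process one nesting level deeper. The following is a speculative sketch under those assumptions; the function name and structure are not taken from the patch:

// Sketch only: pairwise sibling fusion within one block, then recurse.
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "llvm/ADT/SmallVector.h"

static void fuseLoopsInBlock(mlir::Block &block) {
  llvm::SmallVector<mlir::affine::AffineForOp> loops;
  for (mlir::Operation &op : block)
    if (auto forOp = llvm::dyn_cast<mlir::affine::AffineForOp>(&op))
      loops.push_back(forOp);

  for (unsigned i = 0; i < loops.size(); ++i) {
    mlir::affine::AffineForOp dst = loops[i];
    for (unsigned j = i + 1; j < loops.size();) {
      mlir::affine::AffineForOp src = loops[j];
      if (!haveMatchingTripCounts(dst, src)) {
        ++j;
        continue;
      }
      fuseIntoDst(dst, src);          // merge src's body into dst, erase src
      loops.erase(loops.begin() + j); // "SrcLoop reference dropped"
    }
    // Siblings at this level are merged; try one nesting level deeper.
    fuseLoopsInBlock(*dst.getBody());
  }
}

Applied to the function bodies in tests/tosa_reduce_max_min.mlir and tests/tosa_reduce_max_min_diff_axis.mlir, a walk of this shape would reproduce the fused loop nests shown in the TosaAffineFusion dumps above.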


