[Mlir-commits] [llvm] [mlir] [tosa-fuser] Affine Fusion Pass (PR #107383)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Sep 5 04:33:17 PDT 2024
https://github.com/asr-compiler created https://github.com/llvm/llvm-project/pull/107383
This PR adds a standalone tosa-fuser-opt tool and a pass pipeline registered as "O3" that lowers TOSA through linalg and one-shot bufferization down to affine loops (patch 1), plus a tosa-affine-fusion pass that fuses sibling affine.for nests sharing the same parent, bounds, and step, together with example inputs and recorded pipeline output (patch 2).
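
A minimal before/after sketch of what the new tosa-affine-fusion pass does, mirroring the IR dumps in tests/PAF_tosa_add_sub.mlir (the buffer and value names below are illustrative, not taken from the actual test files):

  // Before fusion: two sibling nests with identical bounds and step.
  affine.for %i = 0 to 2 {
    affine.for %j = 0 to 32 {
      %0 = affine.load %a[%i, %j] : memref<2x32xf32>
      %1 = affine.load %b[%i, %j] : memref<2x32xf32>
      %2 = arith.addf %0, %1 : f32
      affine.store %2, %alloc0[%i, %j] : memref<2x32xf32>
    }
  }
  affine.for %i = 0 to 2 {
    affine.for %j = 0 to 32 {
      %0 = affine.load %a[%i, %j] : memref<2x32xf32>
      %1 = affine.load %b[%i, %j] : memref<2x32xf32>
      %2 = arith.addf %0, %1 : f32
      affine.store %2, %alloc1[%i, %j] : memref<2x32xf32>
    }
  }

  // After fusion: the second nest's body is cloned into the first and the
  // second nest is erased; the same rewrite is then applied to the inner loops.
  affine.for %i = 0 to 2 {
    affine.for %j = 0 to 32 {
      %0 = affine.load %a[%i, %j] : memref<2x32xf32>
      %1 = affine.load %b[%i, %j] : memref<2x32xf32>
      %2 = arith.addf %0, %1 : f32
      affine.store %2, %alloc0[%i, %j] : memref<2x32xf32>
      %3 = affine.load %a[%i, %j] : memref<2x32xf32>
      %4 = affine.load %b[%i, %j] : memref<2x32xf32>
      %5 = arith.addf %3, %4 : f32
      affine.store %5, %alloc1[%i, %j] : memref<2x32xf32>
    }
  }

Since the pipeline is registered under the name O3, the full flow can presumably be driven with something like `tosa-fuser-opt --O3 --mlir-print-ir-after-all tests/tosa_add_sub.mlir`, which appears to be how the PAF_*.mlir dumps below were produced.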
>From 268eb827ab3a3fddcd284686cd83ef8fee253b6e Mon Sep 17 00:00:00 2001
From: Akshar S Ramesh <aksharctt at gmail.com>
Date: Wed, 4 Sep 2024 12:31:57 +0530
Subject: [PATCH 1/2] Add tosa-fuser-opt tool
---
mlir/include/mlir/Registration/Pipelines.h | 11 +
mlir/lib/CMakeLists.txt | 1 +
mlir/lib/Registration/CMakeLists.txt | 13 +
mlir/lib/Registration/Pipelines.cpp | 54 ++++
mlir/tools/CMakeLists.txt | 1 +
mlir/tools/tosa-fuser-opt/CMakeLists.txt | 111 +++++++
mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp | 321 +++++++++++++++++++
7 files changed, 512 insertions(+)
create mode 100644 mlir/include/mlir/Registration/Pipelines.h
create mode 100644 mlir/lib/Registration/CMakeLists.txt
create mode 100644 mlir/lib/Registration/Pipelines.cpp
create mode 100644 mlir/tools/tosa-fuser-opt/CMakeLists.txt
create mode 100644 mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp
diff --git a/mlir/include/mlir/Registration/Pipelines.h b/mlir/include/mlir/Registration/Pipelines.h
new file mode 100644
index 00000000000000..7a063b354bd862
--- /dev/null
+++ b/mlir/include/mlir/Registration/Pipelines.h
@@ -0,0 +1,11 @@
+#include "mlir/Pass/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include <cstdint>
+
+namespace mlir {
+ struct TosaFuserPipelineOptions : public PassPipelineOptions<TosaFuserPipelineOptions> {
+
+ };
+ void createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOptions &options, unsigned optLevel);
+ void registerTosaFuserPipeline();
+}
\ No newline at end of file
diff --git a/mlir/lib/CMakeLists.txt b/mlir/lib/CMakeLists.txt
index d25c84a3975db4..435c1dfd0eb97d 100644
--- a/mlir/lib/CMakeLists.txt
+++ b/mlir/lib/CMakeLists.txt
@@ -20,3 +20,4 @@ add_subdirectory(Target)
add_subdirectory(Tools)
add_subdirectory(Transforms)
add_subdirectory(ExecutionEngine)
+add_subdirectory(Registration)
diff --git a/mlir/lib/Registration/CMakeLists.txt b/mlir/lib/Registration/CMakeLists.txt
new file mode 100644
index 00000000000000..4d08e615c062d7
--- /dev/null
+++ b/mlir/lib/Registration/CMakeLists.txt
@@ -0,0 +1,13 @@
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+
+add_mlir_library(TosaFuserRegistration
+ Pipelines.cpp
+
+ LINK_LIBS PUBLIC
+ ${dialect_libs}
+ ${conversion_libs}
+ MLIRPass
+ MLIRTransforms
+ MLIRGPUTransforms
+)
\ No newline at end of file
diff --git a/mlir/lib/Registration/Pipelines.cpp b/mlir/lib/Registration/Pipelines.cpp
new file mode 100644
index 00000000000000..43095b154e0ef9
--- /dev/null
+++ b/mlir/lib/Registration/Pipelines.cpp
@@ -0,0 +1,54 @@
+#include "mlir/Registration/Pipelines.h"
+#include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
+#include "mlir/Conversion/FuncToSPIRV/FuncToSPIRV.h"
+#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRV.h"
+#include "mlir/Conversion/Passes.h"
+#include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Conversion/SCFToSPIRV/SCFToSPIRV.h"
+#include "mlir/Conversion/VectorToSPIRV/VectorToSPIRVPass.h"
+
+#include "mlir/Dialect/Affine/Passes.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Func/Transforms/Passes.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/Linalg/Passes.h"
+#include "mlir/Dialect/MemRef/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Process.h"
+#include <optional>
+
+using namespace mlir;
+
+void mlir::createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOptions &options,
+ unsigned optLevel) {
+ pm.addNestedPass<func::FuncOp>(tosa::createTosaToLinalg());
+ pm.addPass(bufferization::createEmptyTensorEliminationPass());
+ pm.addNestedPass<func::FuncOp>(bufferization::createEmptyTensorToAllocTensorPass());
+ pm.addPass(bufferization::createOneShotBufferizePass());
+ pm.addPass(createCanonicalizerPass());
+ pm.addPass(createConvertLinalgToAffineLoopsPass());
+}
+
+static void tosaFuser3(OpPassManager &pm, const TosaFuserPipelineOptions &options) {
+ createTosaFuserPipeline(pm, options, 3);
+}
+
+void mlir::registerTosaFuserPipeline() {
+ static bool init_once = []() {
+ PassPipelineRegistration<TosaFuserPipelineOptions>(
+ "O3", "Tosa-Fuser Pipeline O3", tosaFuser3);
+ return true;
+ }();
+}
\ No newline at end of file
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
index 9b474385fdae18..01d80f5743fdeb 100644
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(mlir-translate)
add_subdirectory(mlir-vulkan-runner)
add_subdirectory(tblgen-lsp-server)
add_subdirectory(tblgen-to-irdl)
+add_subdirectory(tosa-fuser-opt)
# mlir-cpu-runner requires ExecutionEngine.
if(MLIR_ENABLE_EXECUTION_ENGINE)
diff --git a/mlir/tools/tosa-fuser-opt/CMakeLists.txt b/mlir/tools/tosa-fuser-opt/CMakeLists.txt
new file mode 100644
index 00000000000000..965cc0138e5b4e
--- /dev/null
+++ b/mlir/tools/tosa-fuser-opt/CMakeLists.txt
@@ -0,0 +1,111 @@
+set(LLVM_OPTIONAL_SOURCES
+ null.cpp
+)
+
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
+set(LLVM_LINK_COMPONENTS
+ Core
+ Support
+ AsmParser
+ )
+
+if(MLIR_INCLUDE_TESTS)
+ set(test_libs
+ ${cuda_test_libs}
+ MLIRTestFuncToLLVM
+ MLIRAffineTransformsTestPasses
+ MLIRArithTestPasses
+ MLIRArmNeonTestPasses
+ MLIRArmSMETestPasses
+ MLIRBufferizationTestPasses
+ MLIRControlFlowTestPasses
+ MLIRDLTITestPasses
+ MLIRFuncTestPasses
+ MLIRGPUTestPasses
+ MLIRLinalgTestPasses
+ MLIRLoopLikeInterfaceTestPasses
+ MLIRMathTestPasses
+ MLIRTestMathToVCIX
+ MLIRMemRefTestPasses
+ MLIRMeshTest
+ MLIRNVGPUTestPasses
+ MLIRSCFTestPasses
+ MLIRShapeTestPasses
+ MLIRSPIRVTestPasses
+ MLIRTensorTestPasses
+ MLIRTestAnalysis
+ MLIRTestConvertToSPIRV
+ MLIRTestDialect
+ MLIRTestDynDialect
+ MLIRTestIR
+ MLIRTestOneToNTypeConversionPass
+ MLIRTestPass
+ MLIRTestReducer
+ MLIRTestTransforms
+ MLIRTilingInterfaceTestPasses
+ MLIRVectorTestPasses
+ MLIRTestVectorToSPIRV
+ MLIRLLVMTestPasses
+ )
+ set(test_libs ${test_libs}
+ MLIRTestPDLL
+ MLIRTestTransformDialect
+ )
+
+ if (MLIR_ENABLE_PDL_IN_PATTERNMATCH)
+ set(test_libs ${test_libs}
+ MLIRTestPDLL
+ MLIRTestRewrite
+ )
+ endif()
+endif()
+
+set(LIBS
+ ${dialect_libs}
+ ${conversion_libs}
+ ${extension_libs}
+ ${test_libs}
+
+ MLIRAffineAnalysis
+ MLIRAnalysis
+ MLIRCastInterfaces
+ MLIRDialect
+ MLIROptLib
+ MLIRParser
+ MLIRPass
+ MLIRTransforms
+ MLIRTransformUtils
+ MLIRSupport
+ MLIRIR
+ TosaFuserRegistration
+
+ # TODO: Remove when registerAllGPUToLLVMIRTranslations is no longer
+ # registered directly in tosa-fuser-opt.cpp.
+ MLIRToLLVMIRTranslationRegistration
+ )
+
+# Exclude from libMLIR.so because this has static options intended for
+# opt-like tools only.
+add_mlir_library(MLIRTosaFuserOptMain
+ tosa-fuser-opt.cpp
+
+ EXCLUDE_FROM_LIBMLIR
+
+ LINK_LIBS PUBLIC
+ ${LIBS}
+ )
+
+add_mlir_tool(tosa-fuser-opt
+ tosa-fuser-opt.cpp
+
+ DEPENDS
+ ${LIBS}
+ SUPPORT_PLUGINS
+ )
+target_link_libraries(tosa-fuser-opt PRIVATE ${LIBS})
+llvm_update_compile_flags(tosa-fuser-opt)
+
+mlir_check_all_link_libraries(tosa-fuser-opt)
+export_executable_symbols_for_plugins(tosa-fuser-opt)
diff --git a/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp b/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp
new file mode 100644
index 00000000000000..95a59f44039699
--- /dev/null
+++ b/mlir/tools/tosa-fuser-opt/tosa-fuser-opt.cpp
@@ -0,0 +1,321 @@
+//===- tosa-fuser-opt.cpp - MLIR Optimizer Driver -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Main entry function for tosa-fuser-opt for when built as standalone binary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Config/mlir-config.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllExtensions.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Registration/Pipelines.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Target/LLVMIR/Dialect/All.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace llvm;
+using namespace mlir;
+
+// Defined in the test directory, no public header.
+namespace mlir {
+void registerCloneTestPasses();
+void registerConvertToTargetEnvPass();
+void registerLazyLoadingTestPasses();
+void registerLoopLikeInterfaceTestPasses();
+void registerPassManagerTestPass();
+void registerPrintSpirvAvailabilityPass();
+void registerRegionTestPasses();
+void registerShapeFunctionTestPasses();
+void registerSideEffectTestPasses();
+void registerSliceAnalysisTestPass();
+void registerSymbolTestPasses();
+void registerTestAffineAccessAnalysisPass();
+void registerTestAffineDataCopyPass();
+void registerTestAffineLoopUnswitchingPass();
+void registerTestAffineReifyValueBoundsPass();
+void registerTestAffineWalk();
+void registerTestBytecodeRoundtripPasses();
+void registerTestDecomposeAffineOpPass();
+void registerTestFunc();
+void registerTestGpuLoweringPasses();
+void registerTestGpuMemoryPromotionPass();
+void registerTestLoopPermutationPass();
+void registerTestMatchers();
+void registerTestOperationEqualPass();
+void registerTestPreserveUseListOrders();
+void registerTestPrintDefUsePass();
+void registerTestPrintInvalidPass();
+void registerTestPrintNestingPass();
+void registerTestReducer();
+void registerTestSpirvEntryPointABIPass();
+void registerTestSpirvModuleCombinerPass();
+void registerTestTraitsPass();
+void registerTosaTestQuantUtilAPIPass();
+void registerVectorizerTestPass();
+
+namespace test {
+void registerCommutativityUtils();
+void registerConvertCallOpPass();
+void registerConvertFuncOpPass();
+void registerInliner();
+void registerMemRefBoundCheck();
+void registerPatternsTestPass();
+void registerSimpleParametricTilingPass();
+void registerTestAffineLoopParametricTilingPass();
+void registerTestAliasAnalysisPass();
+void registerTestArithEmulateWideIntPass();
+void registerTestBuiltinAttributeInterfaces();
+void registerTestBuiltinDistinctAttributes();
+void registerTestCallGraphPass();
+void registerTestCfAssertPass();
+void registerTestCFGLoopInfoPass();
+void registerTestComposeSubView();
+void registerTestCompositePass();
+void registerTestConstantFold();
+void registerTestControlFlowSink();
+void registerTestDataLayoutPropagation();
+void registerTestDataLayoutQuery();
+void registerTestDeadCodeAnalysisPass();
+void registerTestDecomposeCallGraphTypes();
+void registerTestDiagnosticsPass();
+void registerTestDiagnosticsMetadataPass();
+void registerTestDominancePass();
+void registerTestDynamicPipelinePass();
+void registerTestEmulateNarrowTypePass();
+void registerTestExpandMathPass();
+void registerTestFooAnalysisPass();
+void registerTestComposeSubView();
+void registerTestMultiBuffering();
+void registerTestIRVisitorsPass();
+void registerTestGenericIRVisitorsPass();
+void registerTestInterfaces();
+void registerTestIRVisitorsPass();
+void registerTestLastModifiedPass();
+void registerTestLinalgDecomposeOps();
+void registerTestLinalgDropUnitDims();
+void registerTestLinalgElementwiseFusion();
+void registerTestLinalgGreedyFusion();
+void registerTestLinalgRankReduceContractionOps();
+void registerTestLinalgTransforms();
+void registerTestLivenessAnalysisPass();
+void registerTestLivenessPass();
+void registerTestLoopFusion();
+void registerTestLoopMappingPass();
+void registerTestLoopUnrollingPass();
+void registerTestLowerToArmNeon();
+void registerTestLowerToArmSME();
+void registerTestLowerToLLVM();
+void registerTestMakeIsolatedFromAbovePass();
+void registerTestMatchReductionPass();
+void registerTestMathAlgebraicSimplificationPass();
+void registerTestMathPolynomialApproximationPass();
+void registerTestMathToVCIXPass();
+void registerTestMemRefDependenceCheck();
+void registerTestMemRefStrideCalculation();
+void registerTestMeshReshardingSpmdizationPass();
+void registerTestMeshSimplificationsPass();
+void registerTestMultiBuffering();
+void registerTestNextAccessPass();
+void registerTestNVGPULowerings();
+void registerTestOneToNTypeConversionPass();
+void registerTestOpaqueLoc();
+void registerTestOpLoweringPasses();
+void registerTestPadFusion();
+void registerTestRecursiveTypesPass();
+void registerTestSCFUpliftWhileToFor();
+void registerTestSCFUtilsPass();
+void registerTestSCFWhileOpBuilderPass();
+void registerTestSCFWrapInZeroTripCheckPasses();
+void registerTestShapeMappingPass();
+void registerTestSliceAnalysisPass();
+void registerTestSPIRVFuncSignatureConversion();
+void registerTestSPIRVVectorUnrolling();
+void registerTestTensorCopyInsertionPass();
+void registerTestTensorTransforms();
+void registerTestTopologicalSortAnalysisPass();
+void registerTestTransformDialectEraseSchedulePass();
+void registerTestVectorLowerings();
+void registerTestVectorReductionToSPIRVDotProd();
+void registerTestWrittenToPass();
+#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
+void registerTestDialectConversionPasses();
+void registerTestPDLByteCodePass();
+void registerTestPDLLPasses();
+#endif
+} // namespace test
+} // namespace mlir
+
+namespace test {
+void registerTestDialect(DialectRegistry &);
+void registerTestDynDialect(DialectRegistry &);
+void registerTestTilingInterfaceTransformDialectExtension(DialectRegistry &);
+void registerTestTransformDialectExtension(DialectRegistry &);
+} // namespace test
+
+#ifdef MLIR_INCLUDE_TESTS
+void registerTestPasses() {
+ registerCloneTestPasses();
+ registerConvertToTargetEnvPass();
+ registerLazyLoadingTestPasses();
+ registerLoopLikeInterfaceTestPasses();
+ registerPassManagerTestPass();
+ registerPrintSpirvAvailabilityPass();
+ registerRegionTestPasses();
+ registerShapeFunctionTestPasses();
+ registerSideEffectTestPasses();
+ registerSliceAnalysisTestPass();
+ registerSymbolTestPasses();
+ registerTestAffineAccessAnalysisPass();
+ registerTestAffineDataCopyPass();
+ registerTestAffineLoopUnswitchingPass();
+ registerTestAffineReifyValueBoundsPass();
+ registerTestAffineWalk();
+ registerTestBytecodeRoundtripPasses();
+ registerTestDecomposeAffineOpPass();
+ registerTestFunc();
+ registerTestGpuLoweringPasses();
+ registerTestGpuMemoryPromotionPass();
+ registerTestLoopPermutationPass();
+ registerTestMatchers();
+ registerTestOperationEqualPass();
+ registerTestPreserveUseListOrders();
+ registerTestPrintDefUsePass();
+ registerTestPrintInvalidPass();
+ registerTestPrintNestingPass();
+ registerTestReducer();
+ registerTestSpirvEntryPointABIPass();
+ registerTestSpirvModuleCombinerPass();
+ registerTestTraitsPass();
+ registerTosaTestQuantUtilAPIPass();
+ registerVectorizerTestPass();
+
+ mlir::test::registerCommutativityUtils();
+ mlir::test::registerConvertCallOpPass();
+ mlir::test::registerConvertFuncOpPass();
+ mlir::test::registerInliner();
+ mlir::test::registerMemRefBoundCheck();
+ mlir::test::registerPatternsTestPass();
+ mlir::test::registerSimpleParametricTilingPass();
+ mlir::test::registerTestAffineLoopParametricTilingPass();
+ mlir::test::registerTestAliasAnalysisPass();
+ mlir::test::registerTestArithEmulateWideIntPass();
+ mlir::test::registerTestBuiltinAttributeInterfaces();
+ mlir::test::registerTestBuiltinDistinctAttributes();
+ mlir::test::registerTestCallGraphPass();
+ mlir::test::registerTestCfAssertPass();
+ mlir::test::registerTestCFGLoopInfoPass();
+ mlir::test::registerTestComposeSubView();
+ mlir::test::registerTestCompositePass();
+ mlir::test::registerTestConstantFold();
+ mlir::test::registerTestControlFlowSink();
+ mlir::test::registerTestDataLayoutPropagation();
+ mlir::test::registerTestDataLayoutQuery();
+ mlir::test::registerTestDeadCodeAnalysisPass();
+ mlir::test::registerTestDecomposeCallGraphTypes();
+ mlir::test::registerTestDiagnosticsPass();
+ mlir::test::registerTestDiagnosticsMetadataPass();
+ mlir::test::registerTestDominancePass();
+ mlir::test::registerTestDynamicPipelinePass();
+ mlir::test::registerTestEmulateNarrowTypePass();
+ mlir::test::registerTestExpandMathPass();
+ mlir::test::registerTestFooAnalysisPass();
+ mlir::test::registerTestComposeSubView();
+ mlir::test::registerTestMultiBuffering();
+ mlir::test::registerTestIRVisitorsPass();
+ mlir::test::registerTestGenericIRVisitorsPass();
+ mlir::test::registerTestInterfaces();
+ mlir::test::registerTestIRVisitorsPass();
+ mlir::test::registerTestLastModifiedPass();
+ mlir::test::registerTestLinalgDecomposeOps();
+ mlir::test::registerTestLinalgDropUnitDims();
+ mlir::test::registerTestLinalgElementwiseFusion();
+ mlir::test::registerTestLinalgGreedyFusion();
+ mlir::test::registerTestLinalgRankReduceContractionOps();
+ mlir::test::registerTestLinalgTransforms();
+ mlir::test::registerTestLivenessAnalysisPass();
+ mlir::test::registerTestLivenessPass();
+ mlir::test::registerTestLoopFusion();
+ mlir::test::registerTestLoopMappingPass();
+ mlir::test::registerTestLoopUnrollingPass();
+ mlir::test::registerTestLowerToArmNeon();
+ mlir::test::registerTestLowerToArmSME();
+ mlir::test::registerTestLowerToLLVM();
+ mlir::test::registerTestMakeIsolatedFromAbovePass();
+ mlir::test::registerTestMatchReductionPass();
+ mlir::test::registerTestMathAlgebraicSimplificationPass();
+ mlir::test::registerTestMathPolynomialApproximationPass();
+ mlir::test::registerTestMathToVCIXPass();
+ mlir::test::registerTestMemRefDependenceCheck();
+ mlir::test::registerTestMemRefStrideCalculation();
+ mlir::test::registerTestMeshReshardingSpmdizationPass();
+ mlir::test::registerTestMeshSimplificationsPass();
+ mlir::test::registerTestMultiBuffering();
+ mlir::test::registerTestNextAccessPass();
+ mlir::test::registerTestNVGPULowerings();
+ mlir::test::registerTestOneToNTypeConversionPass();
+ mlir::test::registerTestOpaqueLoc();
+ mlir::test::registerTestOpLoweringPasses();
+ mlir::test::registerTestPadFusion();
+ mlir::test::registerTestRecursiveTypesPass();
+ mlir::test::registerTestSCFUpliftWhileToFor();
+ mlir::test::registerTestSCFUtilsPass();
+ mlir::test::registerTestSCFWhileOpBuilderPass();
+ mlir::test::registerTestSCFWrapInZeroTripCheckPasses();
+ mlir::test::registerTestShapeMappingPass();
+ mlir::test::registerTestSliceAnalysisPass();
+ mlir::test::registerTestSPIRVFuncSignatureConversion();
+ mlir::test::registerTestSPIRVVectorUnrolling();
+ mlir::test::registerTestTensorCopyInsertionPass();
+ mlir::test::registerTestTensorTransforms();
+ mlir::test::registerTestTopologicalSortAnalysisPass();
+ mlir::test::registerTestTransformDialectEraseSchedulePass();
+ mlir::test::registerTestVectorLowerings();
+ mlir::test::registerTestVectorReductionToSPIRVDotProd();
+ mlir::test::registerTestWrittenToPass();
+#if MLIR_ENABLE_PDL_IN_PATTERNMATCH
+ mlir::test::registerTestDialectConversionPasses();
+ mlir::test::registerTestPDLByteCodePass();
+ mlir::test::registerTestPDLLPasses();
+#endif
+}
+#endif
+
+int main(int argc, char **argv) {
+ registerAllPasses();
+#ifdef MLIR_INCLUDE_TESTS
+ registerTestPasses();
+#endif
+ DialectRegistry registry;
+ registerAllDialects(registry);
+ registerAllExtensions(registry);
+ registerTosaFuserPipeline();
+
+ // TODO: Remove this and the corresponding MLIRToLLVMIRTranslationRegistration
+ // cmake dependency when a safe dialect interface registration mechanism is
+ // implemented, see D157703 (and corresponding note on the declaration).
+ registerAllGPUToLLVMIRTranslations(registry);
+
+#ifdef MLIR_INCLUDE_TESTS
+ ::test::registerTestDialect(registry);
+ ::test::registerTestTransformDialectExtension(registry);
+ ::test::registerTestTilingInterfaceTransformDialectExtension(registry);
+ ::test::registerTestDynDialect(registry);
+#endif
+ return mlir::asMainReturnCode(mlir::MlirOptMain(
+ argc, argv, "MLIR modular optimizer driver\n", registry));
+}
>From b484e035c7937f5eb5b34c7efa3cd80684e57c41 Mon Sep 17 00:00:00 2001
From: Akshar S Ramesh <aksharctt at gmail.com>
Date: Thu, 5 Sep 2024 10:10:23 +0530
Subject: [PATCH 2/2] Fusion code; Few tests
---
mlir/include/mlir/Transforms/Passes.h | 1 +
mlir/include/mlir/Transforms/Passes.td | 6 +
mlir/lib/Registration/Pipelines.cpp | 3 +
mlir/lib/Transforms/CMakeLists.txt | 3 +
mlir/lib/Transforms/TosaAffineFusion.cpp | 186 ++++++
tests/PAF_tosa_add_sub.mlir | 309 ++++++++++
tests/PAF_tosa_reduce_max_min.mlir | 425 +++++++++++++
tests/PAF_tosa_reduce_max_min_diff_axis.mlir | 589 +++++++++++++++++++
tests/tosa_add_sub.mlir | 5 +
tests/tosa_reduce_max_min.mlir | 5 +
tests/tosa_reduce_max_min_diff_axis.mlir | 5 +
11 files changed, 1537 insertions(+)
create mode 100644 mlir/lib/Transforms/TosaAffineFusion.cpp
create mode 100644 tests/PAF_tosa_add_sub.mlir
create mode 100644 tests/PAF_tosa_reduce_max_min.mlir
create mode 100644 tests/PAF_tosa_reduce_max_min_diff_axis.mlir
create mode 100644 tests/tosa_add_sub.mlir
create mode 100644 tests/tosa_reduce_max_min.mlir
create mode 100644 tests/tosa_reduce_max_min_diff_axis.mlir
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h
index 8e4a43c3f24586..19aa4960955618 100644
--- a/mlir/include/mlir/Transforms/Passes.h
+++ b/mlir/include/mlir/Transforms/Passes.h
@@ -138,6 +138,7 @@ std::unique_ptr<Pass> createCompositeFixedPointPass(
std::string name, llvm::function_ref<void(OpPassManager &)> populateFunc,
int maxIterations = 10);
+std::unique_ptr<Pass> createTosaAffineFusionPass();
//===----------------------------------------------------------------------===//
// Registration
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index 000d9f697618e6..95e2cd8959483a 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -577,4 +577,10 @@ def CompositeFixedPointPass : Pass<"composite-fixed-point-pass"> {
];
}
+def TosaAffineFusion : Pass<"tosa-affine-fusion"> {
+ let summary = "Fuse Loops lowered from tosa";
+ let constructor = "mlir::createTosaAffineFusionPass()";
+ let dependentDialects = ["affine::AffineDialect"];
+}
+
#endif // MLIR_TRANSFORMS_PASSES
diff --git a/mlir/lib/Registration/Pipelines.cpp b/mlir/lib/Registration/Pipelines.cpp
index 43095b154e0ef9..e2687cc95c0616 100644
--- a/mlir/lib/Registration/Pipelines.cpp
+++ b/mlir/lib/Registration/Pipelines.cpp
@@ -37,8 +37,11 @@ void mlir::createTosaFuserPipeline(OpPassManager &pm, const TosaFuserPipelineOpt
pm.addPass(bufferization::createEmptyTensorEliminationPass());
pm.addNestedPass<func::FuncOp>(bufferization::createEmptyTensorToAllocTensorPass());
pm.addPass(bufferization::createOneShotBufferizePass());
+ pm.addPass(func::createFuncBufferizePass());
pm.addPass(createCanonicalizerPass());
pm.addPass(createConvertLinalgToAffineLoopsPass());
+
+ pm.addPass(createTosaAffineFusionPass());
}
static void tosaFuser3(OpPassManager &pm, const TosaFuserPipelineOptions &options) {
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 90c0298fb5e46a..4519619f2652a6 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -20,6 +20,7 @@ add_mlir_library(MLIRTransforms
SymbolPrivatize.cpp
TopologicalSort.cpp
ViewOpGraph.cpp
+ TosaAffineFusion.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms
@@ -29,6 +30,8 @@ add_mlir_library(MLIRTransforms
LINK_LIBS PUBLIC
MLIRAnalysis
+ MLIRAffineDialect
+ MLIRFuncDialect
MLIRCopyOpInterface
MLIRFunctionInterfaces
MLIRLoopLikeInterface
diff --git a/mlir/lib/Transforms/TosaAffineFusion.cpp b/mlir/lib/Transforms/TosaAffineFusion.cpp
new file mode 100644
index 00000000000000..81963e5eeba859
--- /dev/null
+++ b/mlir/lib/Transforms/TosaAffineFusion.cpp
@@ -0,0 +1,186 @@
+#include "mlir/Analysis/AliasAnalysis.h"
+#include "mlir/Dialect/Affine/Analysis/Utils.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/LoopUtils.h"
+#include "mlir/Dialect/Affine/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_TOSAAFFINEFUSION
+#include "mlir/Transforms/Passes.h.inc"
+}
+
+#define DEBUG_TYPE "tosa-affine-fusion"
+
+using namespace mlir;
+using namespace mlir::affine;
+
+namespace {
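+// A naive sibling-loop fusion pass: collect the top-level affine.for ops of
+// every block and merge loops whose bounds and step match by cloning the
+// source loop's body into the destination loop.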
+class TosaAffineFusion : public mlir::impl::TosaAffineFusionBase<TosaAffineFusion> {
+
+public:
+ TosaAffineFusion() = default;
+ void runOnOperation() override;
+ bool checkFusibility(AffineForOp *dstLoop, AffineForOp *srcLoop);
+ void moveIntermediateOps(AffineForOp *dstLoop, AffineForOp *srcLoop);
+ void fuseSiblingLoops(AffineForOp *dstLoop, AffineForOp *srcLoop);
+ bool useInsideLoop(Operation *user, AffineForOp *srcLoop);
+ void fuseLoopsInBlock(Block *block);
+};
+
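+// Two sibling loops are treated as fusible when they share the same parent
+// op, constant lower/upper bounds, and step; no dependence analysis is done.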
+bool TosaAffineFusion::checkFusibility(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+ if (dstLoop->getOperation() == srcLoop->getOperation()) {
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] Same Loop\n";
+ return false;
+ }
+
+ if (dstLoop->getOperation()->getParentOp() != srcLoop->getOperation()->getParentOp()) {
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] Parent is not same\n";
+ return false;
+ }
+
+ if (dstLoop->getConstantLowerBound() != srcLoop->getConstantLowerBound()) {
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] Lower Bound is not same\n";
+ return false;
+ }
+
+ if (dstLoop->getConstantUpperBound() != srcLoop->getConstantUpperBound()) {
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] Upper Bound is not same\n";
+ return false;
+ }
+
+ if (dstLoop->getStepAsInt() != srcLoop->getStepAsInt()) {
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] Step is not same\n";
+ return false;
+ }
+
+ llvm::errs()<<"[CHECKFUSIBILITY LOG] SUCCESS\n";
+ return true;
+}
+
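+// Returns true if `user` is nested, at any depth, inside `srcLoop`.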
+bool TosaAffineFusion::useInsideLoop(Operation *user, AffineForOp *srcLoop) {
+ while (!isa<func::FuncOp>(user->getParentOp())) {
+ auto *parentOp = user->getParentOp();
+ if (user->getParentOp() == srcLoop->getOperation())
+ return true;
+ user = parentOp;
+ }
+ return false;
+}
+
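+// Moves ops that sit between dstLoop and srcLoop in their block to just
+// before dstLoop when they have a user inside srcLoop, so that values used by
+// srcLoop's body are defined before the fusion point.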
+void TosaAffineFusion::moveIntermediateOps(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+ auto *block = dstLoop->getOperation()->getBlock();
+ bool dstLoopFound = false;
+ for (auto &op : block->getOperations()) {
+ if (&op == dstLoop->getOperation()) {
+ dstLoopFound = true;
+ continue;
+ }
+ if (!dstLoopFound)
+ continue;
+ if (&op == srcLoop->getOperation())
+ break;
+ for (auto *user : op.getUsers())
+ if (useInsideLoop(user, srcLoop))
+ op.moveBefore(dstLoop->getOperation());
+ }
+}
+
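+// Clones srcLoop's body (minus its terminator) at the end of dstLoop's body,
+// remapping srcLoop's induction variable to dstLoop's.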
+void TosaAffineFusion::fuseSiblingLoops(AffineForOp *dstLoop, AffineForOp *srcLoop) {
+ IRMapping map;
+ map.map(srcLoop->getInductionVar(), dstLoop->getInductionVar());
+ OpBuilder builder(*dstLoop);
+ builder.setInsertionPoint(dstLoop->getBody()->getTerminator());
+
+ for (auto &op : srcLoop->getBody()->getOperations()) {
+ if (&op == srcLoop->getBody()->getTerminator())
+ continue;
+ builder.clone(op, map);
+ }
+}
+
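+// Tries to fuse every ordered pair of sibling affine.for ops in `block`, then
+// recurses into the regions of the surviving loops to fuse inner siblings.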
+void TosaAffineFusion::fuseLoopsInBlock(Block *block) {
+ auto affineFors = block->getOps<AffineForOp>();
+ SmallVector<AffineForOp, 4> siblingAffineFors{affineFors.begin(), affineFors.end()};
+
+ for (auto dstLoop : siblingAffineFors) {
+ if (!dstLoop.getOperation()) {
+      llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 1 - DstLoop reference dropped\n";
+ continue;
+ }
+ llvm::errs()<<"[FUSELOOPSINBLOCK LOG] DstLoop -> \n";
+ dstLoop.dump();
+ if (dstLoop->getParentOp() == nullptr) {
+      llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped\n";
+ continue;
+ }
+ for (auto srcLoop : siblingAffineFors) {
+ if (!srcLoop.getOperation()) {
+        llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 1 - SrcLoop reference dropped\n";
+ continue;
+ }
+ llvm::errs()<<"[FUSELOOPSINBLOCK LOG] SrcLoop -> \n";
+ srcLoop.dump();
+ if (srcLoop->getParentOp() == nullptr) {
+        llvm::errs()<<"[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped\n";
+ continue;
+ }
+ if (!checkFusibility(&dstLoop, &srcLoop))
+ continue;
+
+ llvm::errs()<<"[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE\n";
+
+ moveIntermediateOps(&dstLoop, &srcLoop);
+
+ fuseSiblingLoops(&dstLoop, &srcLoop);
+
+ srcLoop->dropAllReferences();
+ srcLoop->remove();
+
+      llvm::errs()<<"[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop\n";
+ dstLoop.dump();
+ }
+
+    for (Region &region : dstLoop->getRegions()) {
+ for (Block &block : region.getBlocks()) {
+ auto affineFors = block.getOps<AffineForOp>();
+ if (!affineFors.empty() && !llvm::hasSingleElement(affineFors)) {
+          llvm::errs()<<"[FUSELOOPSINBLOCK LOG] Recursing into nested block\n";
+
+ fuseLoopsInBlock(&block);
+ }
+ }
+ }
+ }
+  llvm::errs()<<"[FUSELOOPSINBLOCK LOG] Finished fusing block\n";
+}
+
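+// Entry point: runs sibling fusion on every block that contains more than one
+// top-level affine.for.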
+void TosaAffineFusion::runOnOperation() {
+ getOperation()->walk([&](Operation *op) {
+    for (Region &region : op->getRegions()) {
+ for (Block &block : region.getBlocks()) {
+ auto affineFors = block.getOps<AffineForOp>();
+ if (!affineFors.empty() && !llvm::hasSingleElement(affineFors)) {
+ fuseLoopsInBlock(&block);
+ }
+ }
+ }
+ });
+}
+
+} // end of namespace
+
+std::unique_ptr<Pass> mlir::createTosaAffineFusionPass() {
+ return std::make_unique<TosaAffineFusion>();
+}
\ No newline at end of file
diff --git a/tests/PAF_tosa_add_sub.mlir b/tests/PAF_tosa_add_sub.mlir
new file mode 100644
index 00000000000000..b5c05d74c7c55c
--- /dev/null
+++ b/tests/PAF_tosa_add_sub.mlir
@@ -0,0 +1,309 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+ %0 = tensor.empty() : tensor<2x32xf32>
+ %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ %2 = tensor.empty() : tensor<2x32xf32>
+ %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+ %0 = tensor.empty() : tensor<2x32xf32>
+ %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ %2 = tensor.empty() : tensor<2x32xf32>
+ %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+ }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+ %0 = bufferization.alloc_tensor() : tensor<2x32xf32>
+ %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%0 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ %2 = bufferization.alloc_tensor() : tensor<2x32xf32>
+ %3 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<2x32xf32>, tensor<2x32xf32>) outs(%2 : tensor<2x32xf32>) {
+ ^bb0(%in: f32, %in_0: f32, %out: f32):
+ %4 = arith.addf %in, %in_0 : f32
+ linalg.yield %4 : f32
+ } -> tensor<2x32xf32>
+ return %1, %3 : tensor<2x32xf32>, tensor<2x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+ %0 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %1 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %2 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %3 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%3, %2 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %6 = arith.addf %in, %in_1 : f32
+ linalg.yield %6 : f32
+ }
+ %4 = bufferization.to_tensor %alloc : memref<2x32xf32>
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_0 : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %6 = arith.addf %in, %in_1 : f32
+ linalg.yield %6 : f32
+ }
+ %5 = bufferization.to_tensor %alloc_0 : memref<2x32xf32>
+ return %4, %5 : tensor<2x32xf32>, tensor<2x32xf32>
+ }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+ %0 = bufferization.to_tensor %arg1 : memref<2x32xf32>
+ %1 = bufferization.to_tensor %arg0 : memref<2x32xf32>
+ %2 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %3 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %4 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %5 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%5, %4 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %10 = arith.addf %in, %in_1 : f32
+ linalg.yield %10 : f32
+ }
+ %6 = bufferization.to_tensor %alloc : memref<2x32xf32>
+ %7 = bufferization.to_memref %6 : memref<2x32xf32>
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%3, %2 : memref<2x32xf32, strided<[?, ?], offset: ?>>, memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_0 : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %10 = arith.addf %in, %in_1 : f32
+ linalg.yield %10 : f32
+ }
+ %8 = bufferization.to_tensor %alloc_0 : memref<2x32xf32>
+ %9 = bufferization.to_memref %8 : memref<2x32xf32>
+ return %7, %9 : memref<2x32xf32>, memref<2x32xf32>
+ }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+#map = affine_map<(d0, d1) -> (d0, d1)>
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<2x32xf32>, memref<2x32xf32>) outs(%alloc : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %0 = arith.addf %in, %in_1 : f32
+ linalg.yield %0 : f32
+ }
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<2x32xf32>, memref<2x32xf32>) outs(%alloc_0 : memref<2x32xf32>) {
+ ^bb0(%in: f32, %in_1: f32, %out: f32):
+ %0 = arith.addf %in, %in_1 : f32
+ linalg.yield %0 : f32
+ }
+ return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+ }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+ }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ }
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] Recursing into nested block
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[FUSELOOPSINBLOCK LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %5 = arith.addf %3, %4 : f32
+ affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.addf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] Finished fusing block
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg1: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.addf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] Finished fusing block
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %5 = arith.addf %3, %4 : f32
+ affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+ }
+}
+
+
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<2x32xf32>, memref<2x32xf32>) {
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %2 = arith.addf %0, %1 : f32
+ affine.store %2, %alloc[%arg2, %arg3] : memref<2x32xf32>
+ %3 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %5 = arith.addf %3, %4 : f32
+ affine.store %5, %alloc_0[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ return %alloc, %alloc_0 : memref<2x32xf32>, memref<2x32xf32>
+ }
+}
+
diff --git a/tests/PAF_tosa_reduce_max_min.mlir b/tests/PAF_tosa_reduce_max_min.mlir
new file mode 100644
index 00000000000000..3325128cab1953
--- /dev/null
+++ b/tests/PAF_tosa_reduce_max_min.mlir
@@ -0,0 +1,425 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+ %0 = tensor.empty() : tensor<32xf32>
+ %cst = arith.constant -3.40282347E+38 : f32
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ %2 = tensor.empty() : tensor<32xf32>
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+module {
+ func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+ %0 = tensor.empty() : tensor<32xf32>
+ %cst = arith.constant -3.40282347E+38 : f32
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ %2 = tensor.empty() : tensor<32xf32>
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %0 = bufferization.alloc_tensor() : tensor<32xf32>
+ %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x32xf32>) outs(%1 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ %2 = bufferization.alloc_tensor() : tensor<32xf32>
+ %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<32xf32>) -> tensor<32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x32xf32>) outs(%3 : tensor<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0, 1]] output_shape [1, 32] : tensor<32xf32> into tensor<1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x32xf32>, tensor<1x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+module {
+ func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+ %0 = bufferization.to_memref %arg1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %1 = bufferization.to_memref %arg0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<32xf32>)
+ linalg.reduce ins(%1 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %2 = bufferization.to_tensor %expand_shape : memref<1x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<32xf32>)
+ linalg.reduce ins(%0 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_1 : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %3 = bufferization.to_tensor %expand_shape_2 : memref<1x32xf32>
+ return %2, %3 : tensor<1x32xf32>, tensor<1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+ %0 = bufferization.to_tensor %arg1 : memref<2x32xf32>
+ %1 = bufferization.to_tensor %arg0 : memref<2x32xf32>
+ %2 = bufferization.to_memref %0 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %3 = bufferization.to_memref %1 : memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<32xf32>)
+ linalg.reduce ins(%3 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %8 = arith.maximumf %in, %init : f32
+ linalg.yield %8 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %4 = bufferization.to_tensor %expand_shape : memref<1x32xf32>
+ %5 = bufferization.to_memref %4 : memref<1x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<32xf32>)
+ linalg.reduce ins(%2 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_1 : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %8 = arith.minimumf %in, %init : f32
+ linalg.yield %8 : f32
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %6 = bufferization.to_tensor %expand_shape_2 : memref<1x32xf32>
+ %7 = bufferization.to_memref %6 : memref<1x32xf32>
+ return %5, %7 : memref<1x32xf32>, memref<1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %cast = memref.cast %arg1 : memref<2x32xf32> to memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %cast_1 = memref.cast %arg0 : memref<2x32xf32> to memref<2x32xf32, strided<[?, ?], offset: ?>>
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc : memref<32xf32>)
+ linalg.reduce ins(%cast_1 : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %0 = arith.maximumf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc_2 : memref<32xf32>)
+ linalg.reduce ins(%cast : memref<2x32xf32, strided<[?, ?], offset: ?>>) outs(%alloc_2 : memref<32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %0 = arith.minimumf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ %expand_shape_3 = memref.expand_shape %alloc_2 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ return %expand_shape, %expand_shape_3 : memref<1x32xf32>, memref<1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ affine.for %arg2 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+ }
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+ }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ }
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg3] : memref<32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg1: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0) -> (d0)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+ }
+}
+
+
+module {
+ func.func @test_add_0d(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>) -> (memref<1x32xf32>, memref<1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<32xf32>
+ affine.for %arg2 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2] : memref<32xf32>
+ affine.store %cst_0, %alloc_1[%arg2] : memref<32xf32>
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3] : memref<2x32xf32>
+ %1 = affine.load %alloc[%arg3] : memref<32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3] : memref<32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3] : memref<2x32xf32>
+ %4 = affine.load %alloc_1[%arg3] : memref<32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg3] : memref<32xf32>
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0, 1]] output_shape [1, 32] : memref<32xf32> into memref<1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x32xf32>, memref<1x32xf32>
+ }
+}
+
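The fusion trace earlier in this dump shows the per-pair checks the walk applies before splicing a source loop into a destination loop: a candidate pair is skipped when both references point at the same loop, rejected when the upper bound or the step differ, and fused on SUCCESS. As a rough, self-contained illustration of that decision order, here is a hypothetical sketch; the struct and function names are made up for the example, and the real pass operates on affine.for ops rather than on plain descriptors.

#include <cstdint>
#include <iostream>
#include <string>

// Hypothetical descriptor for a constant-bound loop; the pass itself walks
// affine.for ops, not this struct.
struct LoopInfo {
  int64_t lowerBound;
  int64_t upperBound;
  int64_t step;
  const void *handle; // identity of the loop, used to detect "same loop"
};

// Mirrors the three verdicts that appear in the trace above, in that order.
std::string checkFusibility(const LoopInfo &dst, const LoopInfo &src) {
  if (dst.handle == src.handle)
    return "Same Loop";
  if (dst.upperBound != src.upperBound)
    return "Upper Bound is not same";
  if (dst.step != src.step)
    return "Step is not same";
  return "SUCCESS";
}

int main() {
  LoopInfo fillMax{0, 32, 1, (const void *)1}; // fill of %alloc,   0 to 32
  LoopInfo reduce{0, 2, 1, (const void *)2};   // max reduction,    0 to 2
  LoopInfo fillMin{0, 32, 1, (const void *)3}; // fill of %alloc_1, 0 to 32
  std::cout << checkFusibility(fillMax, fillMax) << "\n"; // Same Loop
  std::cout << checkFusibility(fillMax, reduce) << "\n";  // Upper Bound is not same
  std::cout << checkFusibility(fillMax, fillMin) << "\n"; // SUCCESS
}

Running the sketch prints the same three verdicts seen in the trace for the fill and reduction loops of this test.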
diff --git a/tests/PAF_tosa_reduce_max_min_diff_axis.mlir b/tests/PAF_tosa_reduce_max_min_diff_axis.mlir
new file mode 100644
index 00000000000000..3e92ea17e38da5
--- /dev/null
+++ b/tests/PAF_tosa_reduce_max_min_diff_axis.mlir
@@ -0,0 +1,589 @@
+// -----// IR Dump After TosaToLinalg (tosa-to-linalg) //----- //
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+ %0 = tensor.empty() : tensor<4x32xf32>
+ %cst = arith.constant -3.40282347E+38 : f32
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+ %2 = tensor.empty() : tensor<2x32xf32>
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
+
+// -----// IR Dump After EmptyTensorElimination (eliminate-empty-tensors) //----- //
+module {
+ func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+ %0 = tensor.empty() : tensor<4x32xf32>
+ %cst = arith.constant -3.40282347E+38 : f32
+ %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+ %2 = tensor.empty() : tensor<2x32xf32>
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %0 = bufferization.alloc_tensor() : tensor<4x32xf32>
+ %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<4x32xf32>) -> tensor<4x32xf32>
+ %reduced = linalg.reduce ins(%arg0 : tensor<2x4x32xf32>) outs(%1 : tensor<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded = tensor.expand_shape %reduced [[0, 1], [2]] output_shape [1, 4, 32] : tensor<4x32xf32> into tensor<1x4x32xf32>
+ %2 = bufferization.alloc_tensor() : tensor<2x32xf32>
+ %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2x32xf32>) -> tensor<2x32xf32>
+ %reduced_1 = linalg.reduce ins(%arg1 : tensor<2x4x32xf32>) outs(%3 : tensor<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expanded_2 = tensor.expand_shape %reduced_1 [[0], [1, 2]] output_shape [2, 1, 32] : tensor<2x32xf32> into tensor<2x1x32xf32>
+ return %expanded, %expanded_2 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
+
+// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
+module {
+ func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+ %0 = bufferization.to_memref %arg1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %1 = bufferization.to_memref %arg0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<4x32xf32>)
+ linalg.reduce ins(%1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %4 = arith.maximumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %2 = bufferization.to_tensor %expand_shape : memref<1x4x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<2x32xf32>)
+ linalg.reduce ins(%0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_1 : memref<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %4 = arith.minimumf %in, %init : f32
+ linalg.yield %4 : f32
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ %3 = bufferization.to_tensor %expand_shape_2 : memref<2x1x32xf32>
+ return %2, %3 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After FuncBufferize (func-bufferize) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+ %0 = bufferization.to_tensor %arg1 : memref<2x4x32xf32>
+ %1 = bufferization.to_tensor %arg0 : memref<2x4x32xf32>
+ %2 = bufferization.to_memref %0 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %3 = bufferization.to_memref %1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %cst = arith.constant 3.40282347E+38 : f32
+ %cst_0 = arith.constant -3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc : memref<4x32xf32>)
+ linalg.reduce ins(%3 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %8 = arith.maximumf %in, %init : f32
+ linalg.yield %8 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %4 = bufferization.to_tensor %expand_shape : memref<1x4x32xf32>
+ %5 = bufferization.to_memref %4 : memref<1x4x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc_1 : memref<2x32xf32>)
+ linalg.reduce ins(%2 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_1 : memref<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %8 = arith.minimumf %in, %init : f32
+ linalg.yield %8 : f32
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ %6 = bufferization.to_tensor %expand_shape_2 : memref<2x1x32xf32>
+ %7 = bufferization.to_memref %6 : memref<2x1x32xf32>
+ return %5, %7 : memref<1x4x32xf32>, memref<2x1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After Canonicalizer (canonicalize) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %cast = memref.cast %arg1 : memref<2x4x32xf32> to memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %cast_1 = memref.cast %arg0 : memref<2x4x32xf32> to memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ linalg.fill ins(%cst : f32) outs(%alloc : memref<4x32xf32>)
+ linalg.reduce ins(%cast_1 : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc : memref<4x32xf32>) dimensions = [0]
+ (%in: f32, %init: f32) {
+ %0 = arith.maximumf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ linalg.fill ins(%cst_0 : f32) outs(%alloc_2 : memref<2x32xf32>)
+ linalg.reduce ins(%cast : memref<2x4x32xf32, strided<[?, ?, ?], offset: ?>>) outs(%alloc_2 : memref<2x32xf32>) dimensions = [1]
+ (%in: f32, %init: f32) {
+ %0 = arith.minimumf %in, %init : f32
+ linalg.yield %0 : f32
+ }
+ %expand_shape_3 = memref.expand_shape %alloc_2 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ return %expand_shape, %expand_shape_3 : memref<1x4x32xf32>, memref<2x1x32xf32>
+ }
+}
+
+
+// -----// IR Dump After ConvertLinalgToAffineLoopsPass (convert-linalg-to-affine-loops) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+ }
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+ }
+}
+
+
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ }
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ }
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %2 = arith.minimumf %0, %1 : f32
+ affine.store %2, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] SUCCESS
+[FUSELOOPSINBLOCK LOG] DSTLoop SRCLoop FUSABLE
+[CHECKFUSIBILITY LOG] New FUSED DSTLoop
+affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+^bb0(%arg0: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+}
+[CHECKFUSIBILITY LOG] Upper Bound is not same
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+}
+[CHECKFUSIBILITY LOG] Same Loop
+[FUSELOOPSINBLOCK LOG] SrcLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg1: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - SrcLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg1: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg1: index):
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[FUSELOOPSINBLOCK LOG] DstLoop ->
+"affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (2)>}> ({
+^bb0(%arg0: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (4)>}> ({
+ ^bb0(%arg1: index):
+ "affine.for"() <{lowerBoundMap = affine_map<() -> (0)>, operandSegmentSizes = array<i32: 0, 0, 0>, step = 1 : index, upperBoundMap = affine_map<() -> (32)>}> ({
+ ^bb0(%arg2: index):
+ %0 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %1 = "affine.load"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ %2 = "arith.minimumf"(<<NULL VALUE>>, <<NULL VALUE>>) <{fastmath = #arith.fastmath<none>}> : (<<NULL TYPE>>, <<NULL TYPE>>) -> f32
+ "affine.store"(<<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>, <<NULL VALUE>>) <{map = affine_map<(d0, d1) -> (d0, d1)>}> : (<<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>, <<NULL TYPE>>) -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+ }) : () -> ()
+ "affine.yield"() : () -> ()
+}) : () -> ()
+[FUSELOOPSINBLOCK LOG] 2 - DstLoop reference dropped
+[CHECKFUSIBILITY LOG] Step is not same
+// -----// IR Dump After TosaAffineFusion (tosa-affine-fusion) //----- //
+module {
+ func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+ }
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+ }
+}
+
+
+module {
+ func.func @test_add_0d(%arg0: memref<2x4x32xf32>, %arg1: memref<2x4x32xf32>) -> (memref<1x4x32xf32>, memref<2x1x32xf32>) {
+ %cst = arith.constant -3.40282347E+38 : f32
+ %cst_0 = arith.constant 3.40282347E+38 : f32
+ %alloc = memref.alloc() {alignment = 64 : i64} : memref<4x32xf32>
+ affine.for %arg2 = 0 to 4 {
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst, %alloc[%arg2, %arg3] : memref<4x32xf32>
+ }
+ }
+ %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<2x32xf32>
+ affine.for %arg2 = 0 to 2 {
+ affine.for %arg3 = 0 to 4 {
+ affine.for %arg4 = 0 to 32 {
+ %0 = affine.load %arg0[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %1 = affine.load %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %2 = arith.maximumf %0, %1 : f32
+ affine.store %2, %alloc[%arg3, %arg4] : memref<4x32xf32>
+ %3 = affine.load %arg1[%arg2, %arg3, %arg4] : memref<2x4x32xf32>
+ %4 = affine.load %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ %5 = arith.minimumf %3, %4 : f32
+ affine.store %5, %alloc_1[%arg2, %arg4] : memref<2x32xf32>
+ }
+ }
+ affine.for %arg3 = 0 to 32 {
+ affine.store %cst_0, %alloc_1[%arg2, %arg3] : memref<2x32xf32>
+ }
+ }
+ %expand_shape = memref.expand_shape %alloc [[0, 1], [2]] output_shape [1, 4, 32] : memref<4x32xf32> into memref<1x4x32xf32>
+ %expand_shape_2 = memref.expand_shape %alloc_1 [[0], [1, 2]] output_shape [2, 1, 32] : memref<2x32xf32> into memref<2x1x32xf32>
+ return %expand_shape, %expand_shape_2 : memref<1x4x32xf32>, memref<2x1x32xf32>
+ }
+}
+
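For reference, the computation exercised by this dump reduces one 2x4x32 input with max along axis 0 and the other with min along axis 1, using the -3.40282347E+38 and 3.40282347E+38 initializers visible in the IR. A minimal stand-alone sketch of those two reductions as plain loops follows; it is illustrative only, independent of the pass, and the array contents are made up for the example.

#include <algorithm>
#include <iostream>
#include <limits>

int main() {
  float in0[2][4][32], in1[2][4][32];
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 4; ++j)
      for (int k = 0; k < 32; ++k) {
        in0[i][j][k] = static_cast<float>(i + j + k); // arbitrary sample data
        in1[i][j][k] = static_cast<float>(i * j - k);
      }

  float maxOut[4][32], minOut[2][32];
  for (int j = 0; j < 4; ++j)
    for (int k = 0; k < 32; ++k)
      maxOut[j][k] = std::numeric_limits<float>::lowest(); // -3.40282347E+38
  for (int i = 0; i < 2; ++i)
    for (int k = 0; k < 32; ++k)
      minOut[i][k] = std::numeric_limits<float>::max();    //  3.40282347E+38

  // reduce_max along axis 0: 2x4x32 -> 4x32.
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 4; ++j)
      for (int k = 0; k < 32; ++k)
        maxOut[j][k] = std::max(maxOut[j][k], in0[i][j][k]);

  // reduce_min along axis 1: 2x4x32 -> 2x32.
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 4; ++j)
      for (int k = 0; k < 32; ++k)
        minOut[i][k] = std::min(minOut[i][k], in1[i][j][k]);

  std::cout << maxOut[0][0] << " " << minOut[0][0] << "\n";
}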
diff --git a/tests/tosa_add_sub.mlir b/tests/tosa_add_sub.mlir
new file mode 100644
index 00000000000000..49c4ed9d9684da
--- /dev/null
+++ b/tests/tosa_add_sub.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<2x32xf32>, tensor<2x32xf32>) {
+ %0 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
+ %1 = tosa.add %arg0, %arg1 : (tensor<2x32xf32>, tensor<2x32xf32>) -> tensor<2x32xf32>
+ return %0, %1 : tensor<2x32xf32>, tensor<2x32xf32>
+}
\ No newline at end of file
diff --git a/tests/tosa_reduce_max_min.mlir b/tests/tosa_reduce_max_min.mlir
new file mode 100644
index 00000000000000..ef5c6fa78aab5e
--- /dev/null
+++ b/tests/tosa_reduce_max_min.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x32xf32>, %arg1: tensor<2x32xf32>) -> (tensor<1x32xf32>, tensor<1x32xf32>) {
+ %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<2x32xf32>) -> tensor<1x32xf32>
+ %1 = tosa.reduce_min %arg1 {axis = 0 : i32} : (tensor<2x32xf32>) -> tensor<1x32xf32>
+ return %0, %1 : tensor<1x32xf32>, tensor<1x32xf32>
+}
\ No newline at end of file
diff --git a/tests/tosa_reduce_max_min_diff_axis.mlir b/tests/tosa_reduce_max_min_diff_axis.mlir
new file mode 100644
index 00000000000000..d7a97a414b6595
--- /dev/null
+++ b/tests/tosa_reduce_max_min_diff_axis.mlir
@@ -0,0 +1,5 @@
+func.func @test_add_0d(%arg0: tensor<2x4x32xf32>, %arg1: tensor<2x4x32xf32>) -> (tensor<1x4x32xf32>, tensor<2x1x32xf32>) {
+ %0 = tosa.reduce_max %arg0 {axis = 0 : i32} : (tensor<2x4x32xf32>) -> tensor<1x4x32xf32>
+ %1 = tosa.reduce_min %arg1 {axis = 1 : i32} : (tensor<2x4x32xf32>) -> tensor<2x1x32xf32>
+ return %0, %1 : tensor<1x4x32xf32>, tensor<2x1x32xf32>
+}
\ No newline at end of file
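The dumps in the PAF_*.mlir files above are per-pass IR prints from an mlir-opt-style driver. Assuming the tool accepts the usual per-pass flags and --mlir-print-ir-after-all, and that each pass is registered under the argument shown in its dump header, one plausible way to regenerate such a dump would be along the lines of:

tosa-fuser-opt tests/tosa_reduce_max_min_diff_axis.mlir \
  --tosa-to-linalg --eliminate-empty-tensors --empty-tensor-to-alloc-tensor \
  --one-shot-bufferize --func-bufferize --canonicalize \
  --convert-linalg-to-affine-loops --tosa-affine-fusion \
  --mlir-print-ir-after-all

The exact driver invocation and option spelling are assumptions; the pipeline registered in this PR may drive these passes differently.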