[Mlir-commits] [mlir] cc47037 - [mlir][AMDGPU] Add emulation pass for atomics on AMDGPU targets
Krzysztof Drewniak
llvmlistbot at llvm.org
Wed May 3 14:18:54 PDT 2023
Author: Krzysztof Drewniak
Date: 2023-05-03T21:18:48Z
New Revision: cc4703745ffa398b66f985b483cb8b61eb2ed425
URL: https://github.com/llvm/llvm-project/commit/cc4703745ffa398b66f985b483cb8b61eb2ed425
DIFF: https://github.com/llvm/llvm-project/commit/cc4703745ffa398b66f985b483cb8b61eb2ed425.diff
LOG: [mlir][AMDGPU] Add emulation pass for atomics on AMDGPU targets
Not all AMDGPU targets support all atomic operations. For example,
there are not atomic floating-point adds on the gfx10 series. Add a
pass to emulate these operations using a compare-and-swap loop, by
analogy to the generic atomicrmw rewrite in MemrefToLLVM.
This pass is named generally, as in the future we may have a
memref-to-amdgpu that translates constructs like atomicrmw fmax (which
doesn't generally exist in LLVM) to the relevant intrinsics, which may
themselves require emulation.
Since the AMDGPU dialect now has a pass that operates on it, the
dialect's directory structure is reorganized to match other similarly
complex dialects.
The pass should be run before amdgpu-to-rocdl if desired.
This commit also adds f64 support to atomic_fmax.
Depends on D148722
Reviewed By: nirvedhmeshram
Differential Revision: https://reviews.llvm.org/D148724
Added:
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt
mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt
mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp
mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
Modified:
mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
mlir/include/mlir/InitAllDialects.h
mlir/include/mlir/InitAllPasses.h
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt
mlir/lib/Dialect/AMDGPU/CMakeLists.txt
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
Removed:
mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h
mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp
################################################################################
diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
index feec7332fbfcc..12a1a5e227d67 100644
--- a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
+++ b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
@@ -8,7 +8,7 @@
#ifndef MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
#define MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
-#include "mlir/Conversion/AMDGPUToROCDL/Chipset.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <memory>
#include <string>
diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
index ed074c205a551..9f57627c321fb 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
@@ -1,12 +1,2 @@
-add_mlir_dialect(AMDGPU amdgpu)
-add_mlir_doc(AMDGPU AMDGPU Dialects/ -gen-dialect-doc)
-
-set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
-mlir_tablegen(AMDGPUEnums.h.inc -gen-enum-decls)
-mlir_tablegen(AMDGPUEnums.cpp.inc -gen-enum-defs)
-add_public_tablegen_target(MLIRAMDGPUEnumsGen)
-
-set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
-mlir_tablegen(AMDGPUAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=amdgpu)
-mlir_tablegen(AMDGPUAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=amdgpu)
-add_public_tablegen_target(MLIRAMDGPUAttributesIncGen)
+add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
similarity index 99%
rename from mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
rename to mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 63c15ac537d15..bec2371a6a705 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -221,7 +221,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
def AMDGPU_RawBufferAtomicFmaxOp :
AMDGPU_Op<"raw_buffer_atomic_fmax", [AllElementTypesMatch<["value", "memref"]>,
AttrSizedOperandSegments]>,
- Arguments<(ins F32:$value,
+ Arguments<(ins AnyTypeOf<[F32, F64]>:$value,
Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
Variadic<I32>:$indices,
DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
similarity index 68%
rename from mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
rename to mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
index 7a7b86695ccf2..90b7936def019 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
@@ -11,22 +11,22 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_
-#define MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_
+#ifndef MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_
+#define MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc"
-#include "mlir/Dialect/AMDGPU/AMDGPUEnums.h.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.h.inc"
#define GET_ATTRDEF_CLASSES
-#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.h.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.h.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/AMDGPU/AMDGPU.h.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPU.h.inc"
-#endif // MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_
+#endif // MLIR_DIALECT_AMDGPU_IR_AMDGPUDIALECT_H_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt
new file mode 100644
index 0000000000000..ed074c205a551
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_mlir_dialect(AMDGPU amdgpu)
+add_mlir_doc(AMDGPU AMDGPU Dialects/ -gen-dialect-doc)
+
+set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
+mlir_tablegen(AMDGPUEnums.h.inc -gen-enum-decls)
+mlir_tablegen(AMDGPUEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRAMDGPUEnumsGen)
+
+set(LLVM_TARGET_DEFINITIONS AMDGPU.td)
+mlir_tablegen(AMDGPUAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=amdgpu)
+mlir_tablegen(AMDGPUAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=amdgpu)
+add_public_tablegen_target(MLIRAMDGPUAttributesIncGen)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..8880989f4fed4
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name AMDGPU)
+add_public_tablegen_target(MLIRAMDGPUTransformsIncGen)
+add_dependencies(mlir-headers MLIRAMDGPUTransformsIncGen)
+
+add_mlir_doc(Passes AMDGPUPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
new file mode 100644
index 0000000000000..8dd5ff1a4b198
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -0,0 +1,33 @@
+//===-- Passes.h - AMDGPU transformation pass declarations --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the transformation passes for the TOSA Dialect in MLIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_
+#define MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_
+
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir {
+class ConversionTarget;
+namespace amdgpu {
+
+#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+
+void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target,
+ RewritePatternSet &patterns,
+ Chipset chipset);
+} // namespace amdgpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_H_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
new file mode 100644
index 0000000000000..e6b27aa842dfc
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -0,0 +1,33 @@
+//===-- Passes.td - AMDGPU pass declarations ----*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the passes for the AMDGPU Dialect in MLIR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
+#define MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
+
+include "mlir/Pass/PassBase.td"
+
+def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> {
+ let summary = "Emulate atomic operations on chipsets that do not support them";
+ let description = [{
+ This pass rewrites any AMDGPU-specific atomic operation that is not supported
+ on the given `chipset` into a compare-and-swap loop.
+ }];
+ let dependentDialects = [
+ "cf::ControlFlowDialect",
+ "arith::ArithDialect",
+ ];
+ let options = [Option<"chipset", "chipset", "std::string",
+ /*default=*/"\"gfx000\"",
+ "Chipset that these operations will run on">];
+}
+
+#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
similarity index 88%
rename from mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h
rename to mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
index 82808821b6e95..5031dc4856173 100644
--- a/mlir/include/mlir/Conversion/AMDGPUToROCDL/Chipset.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Utils/Chipset.h
@@ -5,8 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_CONVERSION_AMDGPUTOROCDL_CHIPSET_H_
-#define MLIR_CONVERSION_AMDGPUTOROCDL_CHIPSET_H_
+#ifndef MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_
+#define MLIR_DIALECT_AMDGPU_UTILS_CHIPSET_H_
#include "mlir/Support/LogicalResult.h"
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index 560b683328dc5..e0f154824d8eb 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -14,7 +14,7 @@
#ifndef MLIR_INITALLDIALECTS_H_
#define MLIR_INITALLDIALECTS_H_
-#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMX/AMXDialect.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.h"
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 216fc12323718..a42983d7fe036 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -15,6 +15,7 @@
#define MLIR_INITALLPASSES_H_
#include "mlir/Conversion/Passes.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
#include "mlir/Dialect/Affine/Passes.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Dialect/Async/Passes.h"
@@ -56,6 +57,7 @@ inline void registerAllPasses() {
// Dialect passes
affine::registerAffinePasses();
+ amdgpu::registerAMDGPUPasses();
registerAsyncPasses();
arith::registerArithPasses();
bufferization::registerBufferizationPasses();
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 015b7164421a1..eeac499f49196 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -10,7 +10,7 @@
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
-#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Pass/Pass.h"
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt
index f574b4c55dd83..43cbc0732516d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt
@@ -1,6 +1,5 @@
add_mlir_conversion_library(MLIRAMDGPUToROCDL
AMDGPUToROCDL.cpp
- Chipset.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/AMDGPUToROCDL
@@ -16,6 +15,7 @@ add_mlir_conversion_library(MLIRAMDGPUToROCDL
MLIRLLVMDialect
MLIRROCDLDialect
MLIRAMDGPUDialect
+ MLIRAMDGPUUtils
MLIRPass
MLIRTransforms
)
diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
index f33061b2d87cf..31167e6af908b 100644
--- a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
@@ -1 +1,3 @@
add_subdirectory(IR)
+add_subdirectory(Transforms)
+add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index b66a4b29fcc2a..cafa6a8fa2dd0 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -30,16 +30,16 @@
using namespace mlir;
using namespace mlir::amdgpu;
-#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc"
void AMDGPUDialect::initialize() {
addOperations<
#define GET_OP_LIST
-#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
>();
addAttributes<
#define GET_ATTRDEF_LIST
-#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc"
>();
}
@@ -282,10 +282,10 @@ LogicalResult MFMAOp::verify() {
return success();
}
-#include "mlir/Dialect/AMDGPU/AMDGPUEnums.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc"
#define GET_ATTRDEF_CLASSES
-#include "mlir/Dialect/AMDGPU/AMDGPUAttributes.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc"
#define GET_OP_CLASSES
-#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..e11b6cc88bf22
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -0,0 +1,19 @@
+add_mlir_dialect_library(MLIRAMDGPUTransforms
+ EmulateAtomics.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
+
+ DEPENDS
+ MLIRAMDGPUTransformsIncGen
+
+ LINK_LIBS PUBLIC
+ MLIRAMDGPUDialect
+ MLIRAMDGPUUtils
+ MLIRArithDialect
+ MLIRControlFlowDialect
+ MLIRIR
+ MLIRPass
+ MLIRTransforms
+ MLIRTransformUtils
+ )
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
new file mode 100644
index 0000000000000..0c69cdc027912
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -0,0 +1,189 @@
+//===- EmulateAtomics.cpp - Emulate unsupported AMDGPU atomics ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUEMULATEATOMICSPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+namespace {
+struct AmdgpuEmulateAtomicsPass
+ : public amdgpu::impl::AmdgpuEmulateAtomicsPassBase<
+ AmdgpuEmulateAtomicsPass> {
+ using AmdgpuEmulateAtomicsPassBase<
+ AmdgpuEmulateAtomicsPass>::AmdgpuEmulateAtomicsPassBase;
+ void runOnOperation() override;
+};
+
+template <typename AtomicOp, typename ArithOp>
+struct RawBufferAtomicByCasPattern : public OpConversionPattern<AtomicOp> {
+ using OpConversionPattern<AtomicOp>::OpConversionPattern;
+ using Adaptor = typename AtomicOp::Adaptor;
+
+ LogicalResult
+ matchAndRewrite(AtomicOp atomicOp, Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override;
+};
+} // namespace
+
+namespace {
+enum class DataArgAction : unsigned char {
+ Duplicate,
+ Drop,
+};
+} // namespace
+
+// Fix up the fact that, when we're migrating from a general bugffer atomic
+// to a load or to a CAS, the number of openrands, and thus the number of
+// entries needed in operand_segment_sizes, needs to change. We use this method
+// because we'd like to preserve unknown attributes on the atomic instead of
+// discarding them.
+static void patchOperandSegmentSizes(ArrayRef<NamedAttribute> attrs,
+ SmallVectorImpl<NamedAttribute> &newAttrs,
+ DataArgAction action) {
+ newAttrs.reserve(attrs.size());
+ for (NamedAttribute attr : attrs) {
+ if (attr.getName().getValue() != "operand_segment_sizes") {
+ newAttrs.push_back(attr);
+ continue;
+ }
+ auto segmentAttr = attr.getValue().cast<DenseI32ArrayAttr>();
+ MLIRContext *context = segmentAttr.getContext();
+ DenseI32ArrayAttr newSegments;
+ switch (action) {
+ case DataArgAction::Drop:
+ newSegments = DenseI32ArrayAttr::get(
+ context, segmentAttr.asArrayRef().drop_front());
+ break;
+ case DataArgAction::Duplicate: {
+ SmallVector<int32_t> newVals;
+ ArrayRef<int32_t> oldVals = segmentAttr.asArrayRef();
+ newVals.push_back(oldVals[0]);
+ newVals.append(oldVals.begin(), oldVals.end());
+ newSegments = DenseI32ArrayAttr::get(context, newVals);
+ break;
+ }
+ }
+ newAttrs.push_back(NamedAttribute(attr.getName(), newSegments));
+ }
+}
+
+template <typename AtomicOp, typename ArithOp>
+LogicalResult RawBufferAtomicByCasPattern<AtomicOp, ArithOp>::matchAndRewrite(
+ AtomicOp atomicOp, Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const {
+ Location loc = atomicOp.getLoc();
+
+ ArrayRef<NamedAttribute> origAttrs = atomicOp->getAttrs();
+ ValueRange operands = adaptor.getOperands();
+ Value data = operands.take_front()[0];
+ ValueRange invariantArgs = operands.drop_front();
+ Type dataType = data.getType();
+
+ SmallVector<NamedAttribute> loadAttrs;
+ patchOperandSegmentSizes(origAttrs, loadAttrs, DataArgAction::Drop);
+ Value initialLoad =
+ rewriter.create<RawBufferLoadOp>(loc, dataType, invariantArgs, loadAttrs);
+ Block *currentBlock = rewriter.getInsertionBlock();
+ Block *afterAtomic =
+ rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
+ Block *loopBlock = rewriter.createBlock(afterAtomic, {dataType}, {loc});
+
+ rewriter.setInsertionPointToEnd(currentBlock);
+ rewriter.create<cf::BranchOp>(loc, loopBlock, initialLoad);
+
+ rewriter.setInsertionPointToEnd(loopBlock);
+ Value prevLoad = loopBlock->getArgument(0);
+ Value operated = rewriter.create<ArithOp>(loc, data, prevLoad);
+
+ SmallVector<NamedAttribute> cmpswapAttrs;
+ patchOperandSegmentSizes(origAttrs, cmpswapAttrs, DataArgAction::Duplicate);
+ SmallVector<Value> cmpswapArgs = {operated, prevLoad};
+ cmpswapArgs.append(invariantArgs.begin(), invariantArgs.end());
+ Value atomicRes = rewriter.create<RawBufferAtomicCmpswapOp>(
+ loc, dataType, cmpswapArgs, cmpswapAttrs);
+
+ // We care about exact bitwise equality here, so do some bitcasts.
+ // These will fold away during lowering to the ROCDL dialect, where
+ // an int->float bitcast is introduced to account for the fact that cmpswap
+ // only takes integer arguments.
+
+ Value prevLoadForCompare = prevLoad;
+ Value atomicResForCompare = atomicRes;
+ if (auto floatDataTy = dataType.dyn_cast<FloatType>()) {
+ Type equivInt = rewriter.getIntegerType(floatDataTy.getWidth());
+ prevLoadForCompare =
+ rewriter.create<arith::BitcastOp>(loc, equivInt, prevLoad);
+ atomicResForCompare =
+ rewriter.create<arith::BitcastOp>(loc, equivInt, atomicRes);
+ }
+ Value canLeave = rewriter.create<arith::CmpIOp>(
+ loc, arith::CmpIPredicate::eq, atomicResForCompare, prevLoadForCompare);
+ rewriter.create<cf::CondBranchOp>(loc, canLeave, afterAtomic, ValueRange{},
+ loopBlock, atomicRes);
+ rewriter.replaceOp(atomicOp, {});
+ return success();
+}
+
+void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
+ ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset) {
+ // gfx10 has no atomic adds.
+ if (chipset.majorVersion == 10 || chipset.majorVersion < 9 ||
+ (chipset.majorVersion == 9 && chipset.minorVersion < 0x08)) {
+ target.addIllegalOp<RawBufferAtomicFaddOp>();
+ }
+ // gfx9 has no to a very limited support for floating-point min and max.
+ if (chipset.majorVersion == 9) {
+ if (chipset.minorVersion >= 0x0a) {
+ // gfx90a supports f64 max (and min, but we don't have a min wrapper right
+ // now) but all other types need to be emulated.
+ target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
+ [](RawBufferAtomicFmaxOp op) -> bool {
+ return op.getValue().getType().isF64();
+ });
+ } else {
+ target.addIllegalOp<RawBufferAtomicFmaxOp>();
+ }
+ }
+ patterns
+ .add<RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
+ RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaxFOp>>(
+ patterns.getContext());
+}
+
+void AmdgpuEmulateAtomicsPass::runOnOperation() {
+ Operation *op = getOperation();
+ FailureOr<Chipset> maybeChipset = Chipset::parse(chipset);
+ if (failed(maybeChipset)) {
+ emitError(op->getLoc(), "Invalid chipset name: " + chipset);
+ return signalPassFailure();
+ }
+
+ MLIRContext &ctx = getContext();
+ ConversionTarget target(ctx);
+ RewritePatternSet patterns(&ctx);
+ target.markUnknownOpDynamicallyLegal(
+ [](Operation *op) -> bool { return true; });
+
+ populateAmdgpuEmulateAtomicsPatterns(target, patterns, *maybeChipset);
+ if (failed(applyPartialConversion(op, target, std::move(patterns))))
+ return signalPassFailure();
+}
diff --git a/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt
new file mode 100644
index 0000000000000..cb67399b7386d
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Utils/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_mlir_dialect_library(MLIRAMDGPUUtils
+ Chipset.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Utils
+
+ LINK_LIBS PUBLIC
+ MLIRAMDGPUDialect
+ MLIRSupport
+ )
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp b/mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp
similarity index 94%
rename from mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp
rename to mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp
index 99087e1c24209..4f075c71c947d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/Chipset.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Utils/Chipset.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Conversion/AMDGPUToROCDL/Chipset.h"
+#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/StringRef.h"
diff --git a/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
new file mode 100644
index 0000000000000..95355e13b8ecb
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/amdgpu-emulate-atomics.mlir
@@ -0,0 +1,52 @@
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx90a %s | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt -split-input-file -amdgpu-emulate-atomics=chipset=gfx1030 %s | FileCheck %s --check-prefixes=CHECK,GFX10
+
+// -----
+
+func.func @atomic_fmax(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
+// CHECK: func @atomic_fmax
+// CHECK-SAME: ([[val:%.+]]: f32, [[buffer:%.+]]: memref<?xf32>, [[idx:%.+]]: i32)
+// CHECK: gpu.printf "Begin\0A"
+// GFX10: amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} [[val]] -> [[buffer]][[[idx]]]
+// GFX9: [[ld:%.+]] = amdgpu.raw_buffer_load {foo, indexOffset = 4 : i32} [[buffer]][[[idx]]]
+// GFX9: cf.br [[loop:\^.+]]([[ld]] : f32)
+// GFX9: [[loop]]([[arg:%.+]]: f32):
+// GFX9: [[operated:%.+]] = arith.maxf [[val]], [[arg]]
+// GFX9: [[atomicRes:%.+]] = amdgpu.raw_buffer_atomic_cmpswap {foo, indexOffset = 4 : i32} [[operated]], [[arg]] -> [[buffer]][[[idx]]]
+// GFX9: [[argCast:%.+]] = arith.bitcast [[arg]] : f32 to i32
+// GFX9: [[resCast:%.+]] = arith.bitcast [[atomicRes]] : f32 to i32
+// GFX9: [[test:%.+]] = arith.cmpi eq, [[resCast]], [[argCast]]
+// GFX9: cf.cond_br [[test]], [[post:\^.+]], [[loop]]([[atomicRes]] : f32)
+// GFX9: [[post]]:
+// CHECK-NEXT: gpu.printf "End\0A"
+ gpu.printf "Begin\n"
+ amdgpu.raw_buffer_atomic_fmax {foo, indexOffset = 4 : i32} %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32
+ gpu.printf "End\n"
+ func.return
+}
+
+// -----
+
+func.func @atomic_fmax_f64(%val: f64, %buffer: memref<?xf64>, %idx: i32) {
+// CHECK: func @atomic_fmax_f64
+// CHECK-SAME: ([[val:%.+]]: f64, [[buffer:%.+]]: memref<?xf64>, [[idx:%.+]]: i32)
+// CHECK: gpu.printf "Begin\0A"
+// GFX9: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// GFX10: amdgpu.raw_buffer_atomic_fmax [[val]] -> [[buffer]][[[idx]]]
+// CHECK-NEXT: gpu.printf "End\0A"
+ gpu.printf "Begin\n"
+ amdgpu.raw_buffer_atomic_fmax %val -> %buffer[%idx] : f64 -> memref<?xf64>, i32
+ gpu.printf "End\n"
+ func.return
+}
+
+// -----
+
+func.func @atomic_fadd(%val: f32, %buffer: memref<?xf32>, %idx: i32) {
+// CHECK: func @atomic_fadd
+// GFX9: amdgpu.raw_buffer_atomic_fadd
+// GFX10: amdgpu.raw_buffer_load
+// GFX10: amdgpu.raw_buffer_atomic_cmpswap
+ amdgpu.raw_buffer_atomic_fadd %val -> %buffer[%idx] : f32 -> memref<?xf32>, i32
+ func.return
+}
More information about the Mlir-commits
mailing list