[llvm-branch-commits] [mlir] [AMDGPU][MLIR] Replace gfx940 and gfx941 with gfx942 in MLIR (PR #125836)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Feb 5 03:03:34 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-amdgpu
Author: Fabian Ritter (ritter-x2a)
<details>
<summary>Changes</summary>
gfx940 and gfx941 are no longer supported. This is one of a series of
PRs to remove them from the code base.
For SWDEV-512631
---
Full diff: https://github.com/llvm/llvm-project/pull/125836.diff
10 Files Affected:
- (modified) mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td (+1-1)
- (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+4-4)
- (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+11-11)
- (modified) mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp (+1-1)
- (modified) mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp (+1-7)
- (modified) mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir (+1-1)
- (modified) mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir (+1-1)
- (modified) mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir (+1-1)
- (modified) mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir (+1-1)
- (modified) mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp (+8-12)
``````````diff
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 69745addfd748ec..24f541587cba88a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -602,7 +602,7 @@ def AMDGPU_MFMAOp :
order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on).
The negateA, negateB, and negateC flags are only supported for double-precision
- operations on gfx940+.
+ operations on gfx942+.
}];
let assemblyFormat = [{
$sourceA `*` $sourceB `+` $destC
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 974712c581537a9..8b4f7e49f573d68 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -382,11 +382,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k">
def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">;
def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">;
def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">;
-// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a
-// NEG bitfield. See IntrinsicsAMDGPU.td for more info.
+// Note: in gfx942, unlike in gfx90a, the f64 xdlops use the "blgp" argument as
+// a NEG bitfield. See IntrinsicsAMDGPU.td for more info.
def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">;
def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">;
-// New in gfx940.
+// New in gfx942.
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">;
def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">;
def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">;
@@ -409,7 +409,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;
-// 2:4 Sparsity ops (GFX940)
+// 2:4 Sparsity ops (GFX942)
def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 51f5d7a161b9030..f67d174d7d7c9a6 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -80,7 +80,7 @@ namespace {
// Define commonly used chipsets versions for convenience.
constexpr Chipset kGfx908 = Chipset(9, 0, 8);
constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
-constexpr Chipset kGfx940 = Chipset(9, 4, 0);
+constexpr Chipset kGfx942 = Chipset(9, 4, 2);
/// Define lowering patterns for raw buffer ops
template <typename GpuOp, typename Intrinsic>
@@ -483,7 +483,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
destElem = destType.getElementType();
if (sourceElem.isF32() && destElem.isF32()) {
- if (mfma.getReducePrecision() && chipset >= kGfx940) {
+ if (mfma.getReducePrecision() && chipset >= kGfx942) {
if (m == 32 && n == 32 && k == 4 && b == 1)
return ROCDL::mfma_f32_32x32x4_xf32::getOperationName();
if (m == 16 && n == 16 && k == 8 && b == 1)
@@ -551,9 +551,9 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
return ROCDL::mfma_i32_32x32x8i8::getOperationName();
if (m == 16 && n == 16 && k == 16 && b == 1)
return ROCDL::mfma_i32_16x16x16i8::getOperationName();
- if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940)
+ if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_32x32x16_i8::getOperationName();
- if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940)
+ if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942)
return ROCDL::mfma_i32_16x16x32_i8::getOperationName();
}
@@ -565,7 +565,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}
if (isa<Float8E5M2FNUZType>(sourceElem) && destElem.isF32() &&
- chipset >= kGfx940) {
+ chipset >= kGfx942) {
// Known to be correct because there are no scalar f8 instructions and
// because a length mismatch will have been caught by the verifier.
Type sourceBElem =
@@ -585,7 +585,7 @@ static std::optional<StringRef> mfmaOpToIntrinsic(MFMAOp mfma,
}
if (isa<Float8E4M3FNUZType>(sourceElem) && destElem.isF32() &&
- chipset >= kGfx940) {
+ chipset >= kGfx942) {
Type sourceBElem =
cast<VectorType>(mfma.getSourceB().getType()).getElementType();
if (m == 16 && n == 16 && k == 32 && b == 1) {
@@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern<MFMAOp> {
return op->emitOpError("MFMA only supported on gfx908+");
uint32_t getBlgpField = static_cast<uint32_t>(op.getBlgp());
if (op.getNegateA() || op.getNegateB() || op.getNegateC()) {
- if (chipset < kGfx940)
- return op.emitOpError("negation unsupported on older than gfx940");
+ if (chipset < kGfx942)
+ return op.emitOpError("negation unsupported on older than gfx942");
getBlgpField |=
op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2);
}
@@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
@@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite(
PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
@@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite(
PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
- if (chipset.majorVersion != 9 || chipset < kGfx940)
+ if (chipset.majorVersion != 9 || chipset < kGfx942)
return rewriter.notifyMatchFailure(
loc, "Fp8 conversion instructions are not available on target "
"architecture and their emulation is not implemented");
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 33370566996eee5..338a2124149a844 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -381,7 +381,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() {
}
bool convertFP8Arithmetic =
- maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0);
+ maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2);
arith::populateArithToAMDGPUConversionPatterns(
patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz,
*maybeChipset);
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
index 77f972e0e589445..7459a6503cddfb4 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp
@@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
}
// gfx9 has no to a very limited support for floating-point min and max.
if (chipset.majorVersion == 9) {
- if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) {
+ if (chipset >= Chipset(9, 0, 0xa)) {
// gfx90a supports f64 max (and min, but we don't have a min wrapper right
// now) but all other types need to be emulated.
target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
@@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
} else {
target.addIllegalOp<RawBufferAtomicFmaxOp>();
}
- if (chipset == Chipset(9, 4, 1)) {
- // gfx941 requires non-CAS atomics to be implemented with CAS loops.
- // The workaround here mirrors HIP and OpenMP.
- target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
- RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
- }
}
patterns.add<
RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
index 7818a525d17b537..a313aaffdf5ccca 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s
// CHECK-LABEL: func @ext_scalar
// CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
index f8a60d37801ebec..52db1421dc3c686 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s
func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>,
%arg2 : vector<16xf32>, %arg3 : vector<4xf32>,
%arg4 : vector<4xf16>, %arg5 : vector<4xi8>,
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
index cd921da2294e13b..07a428566d48852 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -1,5 +1,5 @@
// RUN: mlir-opt --split-input-file %s \
-// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \
+// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \
// RUN: | FileCheck %s
// CHECK-LABEL: func.func @scalar_trunc
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
index bd90facb6154408..da834ff4a111526 100644
--- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s
+// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s
// CHECK-LABEL: func.func @scalar_ext
// CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ)
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 976ff2e7382edf7..570d56f3c6ff139 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
EXPECT_EQ(chipset->minorVersion, 0u);
EXPECT_EQ(chipset->steppingVersion, 0xau);
- chipset = Chipset::parse("gfx940");
+ chipset = Chipset::parse("gfx942");
ASSERT_TRUE(succeeded(chipset));
EXPECT_EQ(chipset->majorVersion, 9u);
EXPECT_EQ(chipset->minorVersion, 4u);
- EXPECT_EQ(chipset->steppingVersion, 0u);
+ EXPECT_EQ(chipset->steppingVersion, 2u);
chipset = Chipset::parse("gfx1103");
ASSERT_TRUE(succeeded(chipset));
@@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
EXPECT_TRUE(failed(Chipset::parse("navi33")));
EXPECT_TRUE(failed(Chipset::parse("rdna2")));
EXPECT_TRUE(failed(Chipset::parse("sm_80")));
- EXPECT_TRUE(failed(Chipset::parse("GFX940")));
- EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
+ EXPECT_TRUE(failed(Chipset::parse("GFX942")));
+ EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
EXPECT_TRUE(failed(Chipset::parse("gfx9")));
- EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
- EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
+ EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
+ EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
}
TEST(ChipsetTest, Comparison) {
- EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
- EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
+ EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));
EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
- EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
- EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));
EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
- EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
- EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
+ EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
}
} // namespace
``````````
</details>
https://github.com/llvm/llvm-project/pull/125836
More information about the llvm-branch-commits
mailing list