[Mlir-commits] [mlir] [mlir][nvgpu] add `nvgpu.rcp` op (PR #100965)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Jul 29 02:05:13 PDT 2024
https://github.com/Observer007 updated https://github.com/llvm/llvm-project/pull/100965
>From 0749a595895feb7507baf3a9a2df093f7284849e Mon Sep 17 00:00:00 2001
From: jingzec <jingzec at nvidia.com>
Date: Fri, 26 Jul 2024 00:48:01 -0700
Subject: [PATCH 1/4] add rcp approx op
---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 13 +++++++
.../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 38 ++++++++++++++++++-
.../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 16 ++++++++
3 files changed, 66 insertions(+), 1 deletion(-)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index dda8f31e688fe..3501c5af16d8c 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -20,6 +20,7 @@
#ifndef NVGPU
#define NVGPU
+include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
@@ -802,4 +803,16 @@ def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulat
let hasVerifier = 1;
}
+def NVGPU_RcpApproxOp : NVGPU_Op<"rcp_approx", [
+ Pure, SameOperandsAndResultType
+]> {
+ let summary = "A wrapper of nvvm rcp.approx.ftz.f";
+ let description = [{
+ F32 vector reciprocal calculation using `nvvm.rcp.approx.ftz.f`.
+ The input and output are both F32 vector with same shape.
+ }];
+ let arguments = (ins VectorOf<[F32]>:$in);
+ let results = (outs VectorOf<[F32]>:$out);
+ let assemblyFormat = "$in attr-dict `:` type($out)";
+}
#endif // NVGPU
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 11d29754aa760..b7ea4aaeb7d8c 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -11,6 +11,7 @@
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
+#include "mlir/Conversion/LLVMCommon/VectorPattern.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -1666,6 +1667,41 @@ struct NVGPUTmaPrefetchOpLowering
}
};
+struct NVGPURcpApproxOpLowering
+ : public ConvertOpToLLVMPattern<nvgpu::RcpApproxOp> {
+ using ConvertOpToLLVMPattern<nvgpu::RcpApproxOp>::ConvertOpToLLVMPattern;
+ LogicalResult
+ matchAndRewrite(nvgpu::RcpApproxOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+ auto i64Ty = b.getI64Type();
+ auto f32Ty = b.getF32Type();
+ VectorType inTy = op.getIn().getType();
+ // apply rcp.approx.ftz.f on each element in vector.
+ auto convert1DVec = [&](Type llvm1DVectorTy, Value inVec) {
+ Value ret1DVec = b.create<LLVM::UndefOp>(llvm1DVectorTy);
+ int numElems = llvm::cast<VectorType>(llvm1DVectorTy).getNumElements();
+ for (int i = 0; i < numElems; i++) {
+ Value idx = b.create<LLVM::ConstantOp>(i64Ty, b.getI64IntegerAttr(i));
+ Value elem = b.create<LLVM::ExtractElementOp>(inVec, idx);
+ Value dst = b.create<NVVM::RcpApproxFtzF32Op>(f32Ty, elem);
+ ret1DVec = b.create<LLVM::InsertElementOp>(ret1DVec, dst, idx);
+ }
+ return ret1DVec;
+ };
+ if (inTy.getRank() == 1) {
+ rewriter.replaceOp(op, convert1DVec(inTy, adaptor.getIn()));
+ return success();
+ }
+ return LLVM::detail::handleMultidimensionalVectors(
+ op.getOperation(), adaptor.getOperands(), *(this->getTypeConverter()),
+ [&](Type llvm1DVectorTy, ValueRange operands) -> Value {
+ OpAdaptor adaptor(operands);
+ return convert1DVec(llvm1DVectorTy, adaptor.getIn());
+ },
+ rewriter);
+ }
+};
} // namespace
void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -1688,5 +1724,5 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
- NVGPUMmaSparseSyncLowering>(converter);
+ NVGPUMmaSparseSyncLowering, NVGPURcpApproxOpLowering>(converter);
}
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 86a552c03a473..95cd7a0892b6c 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -1339,3 +1339,19 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}
+
+// CHECK-LABEL: @rcp_approx_ftz_f32
+// CHECK-SAME: %[[IN:.*]]: vector<32x16xf32>
+func.func @rcp_approx_ftz_f32(
+ %in: vector<32x16xf32>) {
+ // CHECK: %[[IN_LLVM:.*]] = builtin.unrealized_conversion_cast %[[IN]] : vector<32x16xf32> to !llvm.array<32 x vector<16xf32>>
+ // CHECK: %[[IN1DVEC:.*]] = llvm.extractvalue %[[IN_LLVM]][0] : !llvm.array<32 x vector<16xf32>>
+ // CHECK: %[[OUT1DVEC:.*]] = llvm.mlir.undef : vector<16xf32>
+ // CHECK: %[[IDX_0:.+]] = llvm.mlir.constant(0 : i64) : i64
+ // CHECK: %[[ELEM_0:.*]] = llvm.extractelement %[[IN1DVEC]][%[[IDX_0]] : i64]
+ // CHECK: %[[ELEM_RCP0:.*]] = nvvm.rcp.approx.ftz.f %[[ELEM_0]] : f32
+ // CHECK: llvm.insertelement %[[ELEM_RCP0]], %[[OUT1DVEC]][%[[IDX_0]] : i64] : vector<16xf32>
+ // CHECK-COUNT-511: nvvm.rcp.approx.ftz.f
+ %out = nvgpu.rcp_approx %in : vector<32x16xf32>
+ return
+}
>From 54d91e81723a4afa27f1267f63bee5f343cbf777 Mon Sep 17 00:00:00 2001
From: jingzec <jingzec at nvidia.com>
Date: Sun, 28 Jul 2024 21:12:31 -0700
Subject: [PATCH 2/4] fix
---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index 19070f6f062a0..aad2ac6f4dd2b 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -17,6 +17,7 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
>From 9d1e603b27810a60b313eb3fbf5c0cdb3e1988f2 Mon Sep 17 00:00:00 2001
From: jingzec <jingzec at nvidia.com>
Date: Mon, 29 Jul 2024 01:58:38 -0700
Subject: [PATCH 3/4] fix comment
---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 37 +++++++++++++++----
.../mlir/Dialect/NVGPU/IR/NVGPUDialect.h | 1 -
.../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 9 ++---
mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp | 17 ++++++++-
.../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 2 +-
mlir/test/Dialect/NVGPU/invalid.mlir | 18 +++++++++
6 files changed, 68 insertions(+), 16 deletions(-)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 3501c5af16d8c..8e0e5c0501c79 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -20,7 +20,6 @@
#ifndef NVGPU
#define NVGPU
-include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
@@ -110,10 +109,24 @@ def TensorMapInterleaveKind : I32EnumAttr<"TensorMapInterleaveKind",
let cppNamespace = "::mlir::nvgpu";
}
+def RcpApprox : I32EnumAttrCase<"APPROX", 0, "approx">;
+def RcpRN : I32EnumAttrCase<"RN", 1, "rn">;
+def RcpRZ : I32EnumAttrCase<"RZ", 2, "rz">;
+def RcpRM : I32EnumAttrCase<"RM", 3, "rm">;
+def RcpRP : I32EnumAttrCase<"RP", 4, "rp">;
+def RcpRoundingMode : I32EnumAttr<"RcpRoundingMode", "Rounding mode of rcp",
+ [RcpApprox, RcpRN, RcpRZ, RcpRM, RcpRP]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::nvgpu";
+}
+
def TensorMapSwizzleAttr : EnumAttr<NVGPU_Dialect, TensorMapSwizzleKind, "swizzle">;
def TensorMapL2PromoAttr : EnumAttr<NVGPU_Dialect, TensorMapL2PromoKind, "l2promo">;
def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
+def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
+
+// rcp mode attribute
//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
@@ -803,16 +816,24 @@ def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulat
let hasVerifier = 1;
}
-def NVGPU_RcpApproxOp : NVGPU_Op<"rcp_approx", [
- Pure, SameOperandsAndResultType
-]> {
- let summary = "A wrapper of nvvm rcp.approx.ftz.f";
+def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
+ SameOperandsAndResultShape,
+ PredOpTrait<"in and out have same element type",
+ TCresVTEtIsSameAsOp<0, 0>>]> {
+ let summary = "F32 rcp calculation for vector types using nvvm.rcp.* OP";
let description = [{
- F32 vector reciprocal calculation using `nvvm.rcp.approx.ftz.f`.
+ F32 vector reciprocal calculation using `nvvm.rcp.*`. Currently only
+ `nvvm.rcp.approx.ftz.f` is supported.
The input and output are both F32 vector with same shape.
}];
- let arguments = (ins VectorOf<[F32]>:$in);
+ let arguments = (ins VectorOf<[F32]>:$in,
+ DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
+ UnitAttr:$ftz);
let results = (outs VectorOf<[F32]>:$out);
- let assemblyFormat = "$in attr-dict `:` type($out)";
+ let assemblyFormat = [{
+ $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}`
+ attr-dict `:` type($in) `->` type($out)
+ }];
+ let hasVerifier = 1;
}
#endif // NVGPU
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
index aad2ac6f4dd2b..19070f6f062a0 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUDialect.h
@@ -17,7 +17,6 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
-#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Dialect/NVGPU/IR/NVGPUEnums.h.inc"
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index b7ea4aaeb7d8c..cf984ca3293c0 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -1667,11 +1667,10 @@ struct NVGPUTmaPrefetchOpLowering
}
};
-struct NVGPURcpApproxOpLowering
- : public ConvertOpToLLVMPattern<nvgpu::RcpApproxOp> {
- using ConvertOpToLLVMPattern<nvgpu::RcpApproxOp>::ConvertOpToLLVMPattern;
+struct NVGPURcpOpLowering : public ConvertOpToLLVMPattern<nvgpu::RcpOp> {
+ using ConvertOpToLLVMPattern<nvgpu::RcpOp>::ConvertOpToLLVMPattern;
LogicalResult
- matchAndRewrite(nvgpu::RcpApproxOp op, OpAdaptor adaptor,
+ matchAndRewrite(nvgpu::RcpOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
ImplicitLocOpBuilder b(op->getLoc(), rewriter);
auto i64Ty = b.getI64Type();
@@ -1724,5 +1723,5 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
- NVGPUMmaSparseSyncLowering, NVGPURcpApproxOpLowering>(converter);
+ NVGPUMmaSparseSyncLowering, NVGPURcpOpLowering>(converter);
}
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index 26f831f10a4e4..a320e893b59c7 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -10,9 +10,9 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
@@ -644,6 +644,21 @@ LogicalResult WarpgroupMmaInitAccumulatorOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// RcpOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult RcpOp::verify() {
+ RcpRoundingModeAttr rounding = getRoundingAttr();
+ bool ftz = getFtz();
+ // Currently, only `approx` rounding and `ftz` are supported.
+ if (rounding.getValue() != RcpRoundingMode::APPROX || !ftz) {
+ return emitOpError() << "has a limitation. " << rounding
+ << " or non-ftz is not supported yet.";
+ }
+ return success();
+}
+
//===----------------------------------------------------------------------===//
// TableGen'd dialect, type, and op definitions
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index 95cd7a0892b6c..464e11b79ae75 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -1352,6 +1352,6 @@ func.func @rcp_approx_ftz_f32(
// CHECK: %[[ELEM_RCP0:.*]] = nvvm.rcp.approx.ftz.f %[[ELEM_0]] : f32
// CHECK: llvm.insertelement %[[ELEM_RCP0]], %[[OUT1DVEC]][%[[IDX_0]] : i64] : vector<16xf32>
// CHECK-COUNT-511: nvvm.rcp.approx.ftz.f
- %out = nvgpu.rcp_approx %in : vector<32x16xf32>
+ %out = nvgpu.rcp %in {rounding = approx, ftz} : vector<32x16xf32> -> vector<32x16xf32>
return
}
diff --git a/mlir/test/Dialect/NVGPU/invalid.mlir b/mlir/test/Dialect/NVGPU/invalid.mlir
index c3aed35153241..726c245898101 100644
--- a/mlir/test/Dialect/NVGPU/invalid.mlir
+++ b/mlir/test/Dialect/NVGPU/invalid.mlir
@@ -336,3 +336,21 @@ func.func @tma_generate_descriptor_incorrect_last_dim(%desc: !desc, %buffer2: m
nvgpu.tma.async.load %desc[%c0, %c0], %mbarrier[%c0] to %buffer2 : !desc, !mbarrier -> memref<64x128xf32,3>
return
}
+// -----
+
+func.func @rcp_unsupported_rounding_0(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rn> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = rn, ftz} : vector<16xf32> -> vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_rounding_1(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode rz> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = rz} : vector<16xf32> -> vector<16xf32>
+}
+// -----
+
+func.func @rcp_unsupported_ftz(%in : vector<16xf32>) {
+ // expected-error @+1 {{'nvgpu.rcp' op has a limitation. #nvgpu<rcp_rounding_mode approx> or non-ftz is not supported yet.}}
+ %out = nvgpu.rcp %in {rounding = approx} : vector<16xf32> -> vector<16xf32>
+}
>From 9f1829d93f178d3fa9745852e16ddeab7a55c3cc Mon Sep 17 00:00:00 2001
From: jingzec <jingzec at nvidia.com>
Date: Mon, 29 Jul 2024 02:04:34 -0700
Subject: [PATCH 4/4] drop useless
---
mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td | 2 --
1 file changed, 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 8e0e5c0501c79..d8caf2818c4f0 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -126,8 +126,6 @@ def TensorMapOOBAttr : EnumAttr<NVGPU_Dialect, TensorMapOOBKind, "oob">;
def TensorMapInterleaveAttr : EnumAttr<NVGPU_Dialect, TensorMapInterleaveKind, "interleave">;
def RcpRoundingModeAttr : EnumAttr<NVGPU_Dialect, RcpRoundingMode, "rcp_rounding_mode">;
-// rcp mode attribute
-
//===----------------------------------------------------------------------===//
// NVGPU Type Definitions
//===----------------------------------------------------------------------===//
More information about the Mlir-commits
mailing list