[Mlir-commits] [mlir] bfa501b - [mlir][AMDGPU] Move to new buffer resource intrinsics
Krzysztof Drewniak
llvmlistbot at llvm.org
Fri Sep 22 12:48:11 PDT 2023
Author: Krzysztof Drewniak
Date: 2023-09-22T19:48:06Z
New Revision: bfa501b8927a3a18466fe9cc20c6d5afa786599f
URL: https://github.com/llvm/llvm-project/commit/bfa501b8927a3a18466fe9cc20c6d5afa786599f
DIFF: https://github.com/llvm/llvm-project/commit/bfa501b8927a3a18466fe9cc20c6d5afa786599f.diff
LOG: [mlir][AMDGPU] Move to new buffer resource intrinsics
The AMDGPU backend now has buffer resource intrinsics that take a
ptr addrspace(8) resource instead of a vector<4xi32>, improving LLVM's
ability to reason about their memory behavior. This commit moves MLIR
to these new intrinsics.
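As a rough before/after sketch (the SSA value names here are
hypothetical; the op forms match the tests updated below), a raw buffer
load previously required packing the 128-bit descriptor by hand into a
vector<4xi32>:

    %rsrc = llvm.insertelement %flags, %rsrc0[%c3 : i32] : vector<4xi32>
    %v = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : i32

and now goes through a single resource-construction intrinsic that
yields a ptr addrspace(8):

    %rsrc = rocdl.make.buffer.rsrc %base, %stride, %numRecords, %flags
      : !llvm.ptr to !llvm.ptr<8>
    %v = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : i32

The flag word itself is unchanged: (7 << 12) | (4 << 15) = 159744 with
bounds checking on gfx9, plus (1 << 24) | (3 << 28) on gfx10+ for a
total of 822243328, matching the constants the updated tests check for.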
Reviewed By: jsjodin
Differential Revision: https://reviews.llvm.org/D157053
Added:
Modified:
mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
mlir/test/Dialect/LLVMIR/rocdl.mlir
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index 2ac98769cf2f598..503988abfc090a2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -55,17 +55,34 @@ def LLVM_AnyFloat : Type<
def LLVM_AnyPointer : Type<CPred<"::llvm::isa<::mlir::LLVM::LLVMPointerType>($_self)">,
"LLVM pointer type", "::mlir::LLVM::LLVMPointerType">;
+def LLVM_OpaquePointer : Type<
+ And<[LLVM_AnyPointer.predicate,
+ CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).isOpaque()">]>,
+ "LLVM opaque pointer", "::mlir::LLVM::LLVMPointerType">;
+
// Type constraint accepting LLVM pointer type with an additional constraint
// on the element type.
class LLVM_PointerTo<Type pointee> : Type<
And<[LLVM_AnyPointer.predicate,
- Or<[CPred<"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).isOpaque()">,
+ Or<[LLVM_OpaquePointer.predicate,
SubstLeaves<
"$_self",
"::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getElementType()",
pointee.predicate>]>]>,
"LLVM pointer to " # pointee.summary, "::mlir::LLVM::LLVMPointerType">;
+// Opaque pointer in a given address space.
+class LLVM_OpaquePointerInAddressSpace<int addressSpace> : Type<
+ And<[LLVM_OpaquePointer.predicate,
+ CPred<
+ "::llvm::cast<::mlir::LLVM::LLVMPointerType>($_self).getAddressSpace() == "
+ # addressSpace>]>,
+ "Opaque LLVM pointer in address space " # addressSpace,
+ "::mlir::LLVM::LLVMPointerType"> {
+ let builderCall = "$_builder.getType<::mlir::LLVM::LLVMPointerType>("
+ # addressSpace # ")";
+}
+
// Type constraints accepting LLVM pointer type to integer of a specific width.
class LLVM_IntPtrBase<int width, int addressSpace = 0> : Type<
And<[LLVM_PointerTo<I<width>>.predicate,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 13515cd669570e9..08d36397dc31355 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -71,6 +71,14 @@ class ROCDL_IntrPure1Op<string mnemonic> :
LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
"amdgcn_" # !subst(".", "_", mnemonic), [], [], [Pure], 1>;
+class ROCDL_IntrOp<string mnemonic, list<int> overloadedResults,
+ list<int> overloadedOperands, list<Trait> traits, int numResults,
+ int requiresAccessGroup = 0, int requiresAliasAnalysis = 0> :
+ LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
+ "amdgcn_" # !subst(".", "_", mnemonic), overloadedResults,
+ overloadedOperands, traits, numResults, requiresAccessGroup,
+ requiresAliasAnalysis>;
+
//===----------------------------------------------------------------------===//
// ROCDL special register op definitions
//===----------------------------------------------------------------------===//
@@ -262,7 +270,96 @@ def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">
def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">;
def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">;
+//===---------------------------------------------------------------------===//
+// Operations on raw buffer resources (stride of 0, bounds checks either off or in
+// raw buffer mode).
+//===---------------------------------------------------------------------===//
+
+def ROCDLBufferRsrc : LLVM_OpaquePointerInAddressSpace<8>;
+
+def ROCDL_MakeBufferRsrcOp :
+ ROCDL_IntrOp<"make.buffer.rsrc", [], [0], [Pure], 1>,
+ Arguments<(ins LLVM_AnyPointer:$base,
+ I16:$stride,
+ I32:$numRecords,
+ I32:$flags)> {
+ let results = (outs ROCDLBufferRsrc:$res);
+ let assemblyFormat = "operands attr-dict `:` type($base) `to` type($res)";
+}
+
+def ROCDL_RawPtrBufferLoadOp :
+ ROCDL_IntrOp<"raw.ptr.buffer.load", [0], [], [], 1, 0, 1> {
+ dag args = (ins Arg<ROCDLBufferRsrc, "", [MemRead]>:$rsrc,
+ I32:$offset,
+ I32:$soffset,
+ I32:$aux);
+ let arguments = !con(args, aliasAttrs);
+ let assemblyFormat = "operands attr-dict `:` type($res)";
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getRsrc()};
+ }
+ }];
+}
+
+def ROCDL_RawPtrBufferStoreOp :
+ ROCDL_IntrOp<"raw.ptr.buffer.store", [], [0], [], 0, 0, 1> {
+ dag args = (ins LLVM_Type:$vdata,
+ Arg<ROCDLBufferRsrc, "", [MemWrite]>:$rsrc,
+ I32:$offset,
+ I32:$soffset,
+ I32:$aux);
+ let arguments = !con(args, aliasAttrs);
+ let assemblyFormat = "operands attr-dict `:` type($vdata)";
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getRsrc()};
+ }
+ }];
+
+}
+
+def ROCDL_RawPtrBufferAtomicCmpSwap :
+ ROCDL_IntrOp<"raw.ptr.buffer.atomic.cmpswap",
+ [0], [], [AllTypesMatch<["res", "src", "cmp"]>], 1, 0, 1> {
+ dag args = (ins LLVM_Type:$src,
+ LLVM_Type:$cmp,
+ Arg<ROCDLBufferRsrc, "", [MemRead, MemWrite]>:$rsrc,
+ I32:$offset,
+ I32:$soffset,
+ I32:$aux);
+ let arguments = !con(args, aliasAttrs);
+ let assemblyFormat = "operands attr-dict `:` type($res)";
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getRsrc()};
+ }
+ }];
+}
+
+class ROCDL_RawPtrBufferAtomicNoRet<string op> :
+ ROCDL_IntrOp<"raw.ptr.buffer.atomic." # op, [], [0], [], 0, 0, 1> {
+ dag args = (ins LLVM_Type:$vdata,
+ Arg<ROCDLBufferRsrc, "", [MemRead, MemWrite]>:$rsrc,
+ I32:$offset,
+ I32:$soffset,
+ I32:$aux);
+ let arguments = !con(args, aliasAttrs);
+ let assemblyFormat = "operands attr-dict `:` type($vdata)";
+ let extraClassDefinition = [{
+ ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() {
+ return {getRsrc()};
+ }
+ }];
+}
+
+def ROCDL_RawPtrBufferAtomicFmaxOp : ROCDL_RawPtrBufferAtomicNoRet<"fmax">;
+def ROCDL_RawPtrBufferAtomicSmaxOp : ROCDL_RawPtrBufferAtomicNoRet<"smax">;
+def ROCDL_RawPtrBufferAtomicUminOp : ROCDL_RawPtrBufferAtomicNoRet<"umin">;
+// Note: not supported on all architectures
+def ROCDL_RawPtrBufferAtomicFaddOp : ROCDL_RawPtrBufferAtomicNoRet<"fadd">;
+/// LEGACY BUFFER OPERATIONS. DO NOT USE IN NEW CODE. KEPT FOR IR COMPATIBILITY.
//===---------------------------------------------------------------------===//
// Vector buffer load/store intrinsics
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 8d603aa69369de5..eeed04049668420 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -59,7 +59,7 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
MemRefType memrefType = cast<MemRefType>(unconvertedMemref.getType());
if (chipset.majorVersion < 9)
- return gpuOp.emitOpError("Raw buffer ops require GCN or higher");
+ return gpuOp.emitOpError("raw buffer ops require GCN or higher");
Value storeData = adaptor.getODSOperands(0)[0];
if (storeData == memref) // no write component to this op
@@ -82,6 +82,7 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
Type i32 = rewriter.getI32Type();
Type llvmI32 = this->typeConverter->convertType(i32);
+ Type llvmI16 = this->typeConverter->convertType(rewriter.getI16Type());
int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8;
Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth);
@@ -156,41 +157,13 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
if (failed(getStridesAndOffset(memrefType, strides, offset)))
return gpuOp.emitOpError("Can't lower non-stride-offset memrefs");
- // Resource descriptor
- // bits 0-47: base address
- // bits 48-61: stride (0 for raw buffers)
- // bit 62: texture cache coherency (always 0)
- // bit 63: enable swizzles (always off for raw buffers)
- // bits 64-95 (word 2): Number of records, units of stride
- // bits 96-127 (word 3): See below
-
- Type llvm4xI32 = this->typeConverter->convertType(VectorType::get(4, i32));
MemRefDescriptor memrefDescriptor(memref);
- Type llvmI64 = this->typeConverter->convertType(rewriter.getI64Type());
- Value c32I64 = rewriter.create<LLVM::ConstantOp>(
- loc, llvmI64, rewriter.getI64IntegerAttr(32));
-
- Value resource = rewriter.create<LLVM::UndefOp>(loc, llvm4xI32);
Value ptr = memrefDescriptor.alignedPtr(rewriter, loc);
- Value ptrAsInt = rewriter.create<LLVM::PtrToIntOp>(loc, llvmI64, ptr);
- Value lowHalf = rewriter.create<LLVM::TruncOp>(loc, llvmI32, ptrAsInt);
- resource = rewriter.create<LLVM::InsertElementOp>(
- loc, llvm4xI32, resource, lowHalf,
- this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 0));
-
- // Bits 48-63 are used both for the stride of the buffer and (on gfx10) for
- // enabling swizzling. Prevent the high bits of pointers from accidentally
- // setting those flags.
- Value highHalfShifted = rewriter.create<LLVM::TruncOp>(
- loc, llvmI32, rewriter.create<LLVM::LShrOp>(loc, ptrAsInt, c32I64));
- Value highHalfTruncated = rewriter.create<LLVM::AndOp>(
- loc, llvmI32, highHalfShifted,
- createI32Constant(rewriter, loc, 0x0000ffff));
- resource = rewriter.create<LLVM::InsertElementOp>(
- loc, llvm4xI32, resource, highHalfTruncated,
- this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 1));
-
+ // The stride value is always 0 for raw buffers. This also disables
+ // swizzling.
+ Value stride = rewriter.createOrFold<LLVM::ConstantOp>(
+ loc, llvmI16, rewriter.getI16IntegerAttr(0));
Value numRecords;
if (memrefType.hasStaticShape()) {
numRecords = createI32Constant(
@@ -209,11 +182,8 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
numRecords = rewriter.create<LLVM::TruncOp>(loc, llvmI32, maxIndex);
}
- resource = rewriter.create<LLVM::InsertElementOp>(
- loc, llvm4xI32, resource, numRecords,
- this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 2));
- // Final word:
+ // Flag word:
// bits 0-11: dst sel, ignored by these intrinsics
// bits 12-14: num format (ignored, must be nonzero, 7=float)
// bits 15-18: data format (ignored, must be nonzero, 4=32bit)
@@ -227,16 +197,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// bits 28-29: Out of bounds select (0 = structured, 1 = check index, 2 =
// none, 3 = either swizzles or testing against offset field) RDNA only
// bits 30-31: Type (must be 0)
- uint32_t word3 = (7 << 12) | (4 << 15);
+ uint32_t flags = (7 << 12) | (4 << 15);
if (chipset.majorVersion >= 10) {
- word3 |= (1 << 24);
+ flags |= (1 << 24);
uint32_t oob = adaptor.getBoundsCheck() ? 3 : 2;
- word3 |= (oob << 28);
+ flags |= (oob << 28);
}
- Value word3Const = createI32Constant(rewriter, loc, word3);
- resource = rewriter.create<LLVM::InsertElementOp>(
- loc, llvm4xI32, resource, word3Const,
- this->createIndexAttrConstant(rewriter, loc, this->getIndexType(), 3));
+ Value flagsConst = createI32Constant(rewriter, loc, flags);
+ Type rsrcType = LLVM::LLVMPointerType::get(rewriter.getContext(), 8);
+ Value resource = rewriter.createOrFold<ROCDL::MakeBufferRsrcOp>(
+ loc, rsrcType, ptr, stride, numRecords, flagsConst);
args.push_back(resource);
// Indexing (voffset)
@@ -708,16 +678,20 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
});
patterns.add<LDSBarrierOpLowering>(converter);
- patterns.add<
- RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawBufferLoadOp>,
- RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawBufferStoreOp>,
- RawBufferOpLowering<RawBufferAtomicFaddOp, ROCDL::RawBufferAtomicFAddOp>,
- RawBufferOpLowering<RawBufferAtomicFmaxOp, ROCDL::RawBufferAtomicFMaxOp>,
- RawBufferOpLowering<RawBufferAtomicSmaxOp, ROCDL::RawBufferAtomicSMaxOp>,
- RawBufferOpLowering<RawBufferAtomicUminOp, ROCDL::RawBufferAtomicUMinOp>,
- RawBufferOpLowering<RawBufferAtomicCmpswapOp,
- ROCDL::RawBufferAtomicCmpSwap>,
- MFMAOpLowering, WMMAOpLowering>(converter, chipset);
+ patterns
+ .add<RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
+ RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
+ RawBufferOpLowering<RawBufferAtomicFaddOp,
+ ROCDL::RawPtrBufferAtomicFaddOp>,
+ RawBufferOpLowering<RawBufferAtomicFmaxOp,
+ ROCDL::RawPtrBufferAtomicFmaxOp>,
+ RawBufferOpLowering<RawBufferAtomicSmaxOp,
+ ROCDL::RawPtrBufferAtomicSmaxOp>,
+ RawBufferOpLowering<RawBufferAtomicUminOp,
+ ROCDL::RawPtrBufferAtomicUminOp>,
+ RawBufferOpLowering<RawBufferAtomicCmpswapOp,
+ ROCDL::RawPtrBufferAtomicCmpSwap>,
+ MFMAOpLowering, WMMAOpLowering>(converter, chipset);
}
std::unique_ptr<Pass> mlir::createConvertAMDGPUToROCDLPass() {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index dda8a068857d5ee..76e42791323494c 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -4,19 +4,12 @@
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
- // CHECK: %[[ptr:.*]] = llvm.ptrtoint
- // CHECK: %[[lowHalf:.*]] = llvm.trunc %[[ptr]] : i64 to i32
- // CHECK: %[[resource_1:.*]] = llvm.insertelement %[[lowHalf]]
- // CHECK: %[[highHalfI64:.*]] = llvm.lshr %[[ptr]]
- // CHECK: %[[highHalfI32:.*]] = llvm.trunc %[[highHalfI64]] : i64 to i32
- // CHECK: %[[highHalf:.*]] = llvm.and %[[highHalfI32]], %{{.*}} : i32
- // CHECK: %[[resource_2:.*]] = llvm.insertelement %[[highHalf]], %[[resource_1]]
+ // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
- // CHECK: %[[resource_3:.*]] = llvm.insertelement %[[numRecords]], %[[resource_2]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement %[[word3]], %[[resource_3]]
- // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
+ // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[] : memref<i32> -> i32
func.return %0 : i32
@@ -24,19 +17,12 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
- // CHECK: %[[ptr:.*]] = llvm.ptrtoint
- // CHECK: %[[lowHalf:.*]] = llvm.trunc %[[ptr]] : i64 to i32
- // CHECK: %[[resource_1:.*]] = llvm.insertelement %[[lowHalf]]
- // CHECK: %[[highHalfI64:.*]] = llvm.lshr %[[ptr]]
- // CHECK: %[[highHalfI32:.*]] = llvm.trunc %[[highHalfI64]] : i64 to i32
- // CHECK: %[[highHalf:.*]] = llvm.and %[[highHalfI32]], %{{.*}} : i32
- // CHECK: %[[resource_2:.*]] = llvm.insertelement %[[highHalf]], %[[resource_1]]
+ // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[resource_3:.*]] = llvm.insertelement %[[numRecords]], %[[resource_2]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // RDNA: %[[word3:.*]] = llvm.mlir.constant(822243328 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement %[[word3]], %[[resource_3]]
- // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
+ // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> i32
func.return %0 : i32
@@ -44,18 +30,18 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // RDNA: %[[word3:.*]] = llvm.mlir.constant(553807872 : i32)
- // RDNA: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // RDNA: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
- // RDNA: return %[[ret]]
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
+ // RDNA: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
+ // RDNA: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // RDNA: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
func.return %0 : i32
}
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32
func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> {
- // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32>
+ // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32>
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32>
func.return %0 : vector<2xi32>
@@ -64,8 +50,8 @@ func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vec
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8
func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
+ // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> i8
func.return %0 : i8
@@ -74,8 +60,8 @@ func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8
func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
+ // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i16
// CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8>
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8>
@@ -84,7 +70,7 @@ func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vecto
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8
func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> {
- // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
+ // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
// CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8>
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8>
@@ -94,8 +80,8 @@ func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vect
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ
func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: i32) -> f8E5M2FNUZ {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
+ // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8
// CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[loaded]] : i8 to f8E5M2FNUZ
// CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xf8E5M2FNUZ>, i32 -> f8E5M2FNUZ
@@ -105,8 +91,8 @@ func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx:
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ
func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %idx: i32) -> vector<4xf8E4M3FNUZ> {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}}
+ // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
// CHECK: %[[cast:.*]] = llvm.bitcast %[[loaded]] : i32 to vector<4xi8>
// CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[cast]] : vector<4xi8> to vector<4xf8E4M3FNUZ>
// CHECK: return %[[ret]]
@@ -117,11 +103,9 @@ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %id
// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
- // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
func.return
}
@@ -129,10 +113,9 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
func.return
}
@@ -140,7 +123,7 @@ func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx:
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8
func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) {
// CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16
- // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16
+ // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32
func.return
}
@@ -148,7 +131,7 @@ func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8
func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) {
// CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32>
- // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
+ // CHECK: rocdl.raw.ptr.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32>
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32
func.return
}
@@ -157,10 +140,9 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
func.return
}
@@ -168,10 +150,9 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.atomic.fmax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
func.return
}
@@ -179,10 +160,9 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
func.return
}
@@ -190,10 +170,9 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
func.return
}
@@ -204,10 +183,9 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
// CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
// CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
// CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
// CHECK: return %[[dstCast]]
%dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -218,10 +196,9 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
- // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
- // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
- // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
- // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32>
+ // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+ // CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
// CHECK: return %[[dst]]
%dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
func.return %dst : i64
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index e2a35e96fbc558e..26de6a50fee38b9 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -203,6 +203,66 @@ func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
llvm.return
}
+llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
+ %stride : i16,
+ %numRecords : i32,
+ %flags : i32) -> !llvm.ptr<8> {
+ // CHECK-LABEL: rocdl.make.buffer.rsrc
+ // CHECK: %{{.*}} = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr to <8>
+ %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8>
+ llvm.return %rsrc : !llvm.ptr<8>
+}
+
+llvm.func @rocdl.raw.ptr.buffer.f32(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %aux : i32, %vdata1 : f32,
+ %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) {
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.f32
+ // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+ // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf32>
+ // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xf32>
+
+ // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+ // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf32>
+ // CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xf32>
+
+ // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+ // CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
+
+ %r1 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : f32
+ %r2 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xf32>
+ %r4 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xf32>
+
+ rocdl.raw.ptr.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : f32
+ rocdl.raw.ptr.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xf32>
+ rocdl.raw.ptr.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xf32>
+
+ rocdl.raw.ptr.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32
+ rocdl.raw.ptr.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32
+
+ llvm.return
+}
+
+
+llvm.func @rocdl.raw.ptr.buffer.i32(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %aux : i32, %vdata1 : i32,
+ %vdata2 : vector<2xi32>, %vdata4 : vector<4xi32>) {
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.i32
+ // CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: %{{.*}} = rocdl.raw.ptr.buffer.atomic.cmpswap %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i32
+
+ rocdl.raw.ptr.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ rocdl.raw.ptr.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ %val = rocdl.raw.ptr.buffer.atomic.cmpswap %vdata1, %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ llvm.return
+}
+
+// -----
+
+// Tests for deprecated buffer ops.
+
llvm.func @rocdl.mubuf(%rsrc : vector<4xi32>, %vindex : i32,
%offset : i32, %glc : i1,
%slc : i1, %vdata1 : vector<1xf32>,
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 83467553ad5f4ce..777bef8fea5847d 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -300,6 +300,82 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
llvm.return %r0 : vector<8xf32>
}
+llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
+ %stride : i16,
+ %numRecords : i32,
+ %flags : i32) -> !llvm.ptr<8> {
+ // CHECK-LABEL: rocdl.make.buffer.rsrc
+ // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+ // CHECK: ret ptr addrspace(8) %[[rsrc]]
+ %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8>
+ llvm.return %rsrc : !llvm.ptr<8>
+}
+
+llvm.func @rocdl.raw.ptr.buffer(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %vdata1 : i32,
+ %vdata2 : vector<2xi32>,
+ %vdata4 : vector<4xi32>) {
+ %aux = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.ptr.buffer
+ // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call <4 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v4i32(ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+
+ // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+
+ %r1 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : i32
+ %r2 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32>
+ %r4 = rocdl.raw.ptr.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32>
+
+ rocdl.raw.ptr.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ rocdl.raw.ptr.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32>
+ rocdl.raw.ptr.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32>
+
+ llvm.return
+}
+
+llvm.func @rocdl.raw.ptr.buffer.atomic.f32(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %vdata1 : f32) {
+ %aux = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.f32
+ // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+
+ rocdl.raw.ptr.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32
+ rocdl.raw.ptr.buffer.atomic.fmax %vdata1, %rsrc, %offset, %soffset, %aux : f32
+
+ llvm.return
+}
+
+llvm.func @rocdl.raw.ptr.buffer.atomic.i32(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %vdata1 : i32) {
+ %aux = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.i32
+ // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+
+ rocdl.raw.ptr.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ rocdl.raw.ptr.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
+
+ llvm.return
+}
+
+llvm.func @rocdl.raw.ptr.buffer.atomic.cmpswap(%rsrc : !llvm.ptr<8>,
+ %offset : i32, %soffset : i32,
+ %src : i32, %cmp : i32) -> i32 {
+ %aux = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.ptr.buffer.atomic.cmpswap
+ // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, ptr addrspace(8) %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: ret i32 [[val]]
+
+ %val = rocdl.raw.ptr.buffer.atomic.cmpswap %src, %cmp, %rsrc, %offset, %soffset, %aux : i32
+ llvm.return %val : i32
+}
llvm.func @rocdl.mubuf(%rsrc : vector<4xi32>, %vindex : i32,
%offset : i32, %vdata1 : vector<1xf32>,