[Mlir-commits] [mlir] [mlir][amdgpu] Add `rocdl.s.waitcnt` wrapper (PR #149670)
Krzysztof Drewniak
llvmlistbot at llvm.org
Tue Jul 22 11:50:16 PDT 2025
================
@@ -419,6 +419,105 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
};
+// TODO: The AMDGPU backend already has all of this bit-packing logic; we should
+// move it to some common place.
+/// Packs \p vmcnt, \p expcnt and \p lgkmcnt into a single waitcnt bitfield for
+/// \p chipset. Each counter is clamped to the largest value its field can hold
+/// before being packed.
+///
+/// Field positions inside the packed value:
+///   vmcnt:   [3:0] (pre-gfx9), [15:14,3:0] (gfx9, gfx10), [15:10] (gfx11)
+///   expcnt:  [6:4] (pre-gfx11), [2:0] (gfx11)
+///   lgkmcnt: [11:8] (pre-gfx10), [13:8] (gfx10), [9:4] (gfx11)
+///
+/// Returns failure() when the chipset's major version is not handled by any of
+/// the layouts above.
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+                                         unsigned expcnt, unsigned lgkmcnt) {
+  const auto major = chipset.majorVersion;
+  // The expcnt field is three bits wide in every layout handled here.
+  const unsigned exp = std::min(expcnt, 7u);
+
+  if (major < 9) {
+    const unsigned vm = std::min(vmcnt, 15u);
+    const unsigned lgkm = std::min(lgkmcnt, 15u);
+    return (lgkm << 8) | (exp << 4) | vm;
+  }
+
+  if (major == 9 || major == 10) {
+    // gfx9 and gfx10 share the split vmcnt encoding; only the width of the
+    // lgkmcnt field differs (4 bits on gfx9, 6 bits on gfx10).
+    const unsigned vm = std::min(vmcnt, 63u);
+    const unsigned lgkm = std::min(lgkmcnt, major == 9 ? 15u : 63u);
+    const unsigned vmLow = vm & 0xF;
+    const unsigned vmHigh = (vm >> 4) << 14;
+    return vmHigh | (lgkm << 8) | (exp << 4) | vmLow;
+  }
+
+  if (major == 11) {
+    const unsigned vm = std::min(vmcnt, 63u);
+    const unsigned lgkm = std::min(lgkmcnt, 63u);
+    return (vm << 10) | (lgkm << 4) | exp;
+  }
+
+  return failure();
+}
+
+struct MemoryCounterWaitOpLowering
+ : public ConvertOpToLLVMPattern<MemoryCounterWaitOp> {
+ MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter,
+ Chipset chipset)
+ : ConvertOpToLLVMPattern<MemoryCounterWaitOp>(converter),
+ chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(MemoryCounterWaitOp op, OpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ if (chipset.majorVersion >= 12) {
+ Location loc = op.getLoc();
+ if (auto ds = adaptor.getDs())
+ rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
+
+ if (auto load = adaptor.getLoad())
+ rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
+
+ if (auto store = adaptor.getStore())
+ rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
+
+ if (auto exp = adaptor.getExp())
+ rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+
+ rewriter.eraseOp(op);
+ return success();
+ }
+
+ auto getVal = [](Attribute attr) -> unsigned {
+ if (attr)
+ return cast<IntegerAttr>(attr).getInt();
+
+ // This value will be clamped to the maximum value for the chipset.
+ return 1024 * 1024;
+ };
+ unsigned ds = getVal(adaptor.getDsAttr());
+ unsigned load = getVal(adaptor.getLoadAttr());
+ unsigned store = getVal(adaptor.getStoreAttr());
+ unsigned exp = getVal(adaptor.getExpAttr());
+
+ unsigned vmcnt = std::min(load, store);
----------------
krzysz00 wrote:
I'd argue for `load + store` here, since that'll be the total number of outstanding VMEM operations.
https://github.com/llvm/llvm-project/pull/149670
More information about the Mlir-commits
mailing list