[Mlir-commits] [mlir] [mlir][amdgpu] Add `rocdl.s.waitcnt` wrapper (PR #149670)

Tue Jul 22 11:50:16 PDT 2025

================
@@ -419,6 +419,105 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
   }
 };
 
+// TODO: AMDGPU backend already have all this bitpacking logic, we should move
+// it to some common place.
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+///     \p Vmcnt = \p Waitcnt[3:0]        (pre-gfx9)
+///     \p Vmcnt = \p Waitcnt[15:14,3:0]  (gfx9,10)
+///     \p Vmcnt = \p Waitcnt[15:10]      (gfx11)
+///     \p Expcnt = \p Waitcnt[6:4]       (pre-gfx11)
+///     \p Expcnt = \p Waitcnt[2:0]       (gfx11)
+///     \p Lgkmcnt = \p Waitcnt[11:8]     (pre-gfx10)
+///     \p Lgkmcnt = \p Waitcnt[13:8]     (gfx10)
+///     \p Lgkmcnt = \p Waitcnt[9:4]      (gfx11)
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+                                         unsigned expcnt, unsigned lgkmcnt) {
+  if (chipset.majorVersion < 9) {
+    vmcnt = std::min(15u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(15u, lgkmcnt);
+    return vmcnt | (expcnt << 4) | (lgkmcnt << 8);
+  }
+  if (chipset.majorVersion == 9) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(15u, lgkmcnt);
+    unsigned lowBits = vmcnt & 0xF;
+    unsigned highBits = (vmcnt >> 4) << 14;
+    unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+    return lowBits | highBits | otherCnts;
+  }
+  if (chipset.majorVersion == 10) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(63u, lgkmcnt);
+    unsigned lowBits = vmcnt & 0xF;
+    unsigned highBits = (vmcnt >> 4) << 14;
+    unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+    return lowBits | highBits | otherCnts;
+  }
+  if (chipset.majorVersion == 11) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(63u, lgkmcnt);
+    return (vmcnt << 10) | expcnt | (lgkmcnt << 4);
+  }
+  return failure();
+}
+
+struct MemoryCounterWaitOpLowering
+    : public ConvertOpToLLVMPattern<MemoryCounterWaitOp> {
+  MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter,
+                              Chipset chipset)
+      : ConvertOpToLLVMPattern<MemoryCounterWaitOp>(converter),
+        chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(MemoryCounterWaitOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (chipset.majorVersion >= 12) {
+      Location loc = op.getLoc();
+      if (auto ds = adaptor.getDs())
+        rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
+
+      if (auto load = adaptor.getLoad())
+        rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
+
+      if (auto store = adaptor.getStore())
+        rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
+
+      if (auto exp = adaptor.getExp())
+        rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+
+      rewriter.eraseOp(op);
+      return success();
+    }
+
+    auto getVal = [](Attribute attr) -> unsigned {
+      if (attr)
+        return cast<IntegerAttr>(attr).getInt();
+
+      // This value will be clamped to the maximum value for the chipset.
+      return 1024 * 1024;
+    };
+    unsigned ds = getVal(adaptor.getDsAttr());
+    unsigned load = getVal(adaptor.getLoadAttr());
+    unsigned store = getVal(adaptor.getStoreAttr());
+    unsigned exp = getVal(adaptor.getExpAttr());
+
+    unsigned vmcnt = std::min(load, store);
----------------
krzysz00 wrote:

I'd argue for `load + store` here, since that'll be the total amount of outstanding VMEM operations

https://github.com/llvm/llvm-project/pull/149670