[llvm] [AMDGPU] Optimize block count calculations to the new ABI (PR #174112)

Wed Dec 31 15:05:39 PST 2025

================
@@ -322,6 +325,48 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
     }
   }
 
+  // Upgrade the old method of calculating the block size using the grid size.
+  // We pattern match any case where the implicit argument group size is the
+  // divisor to a dispatch packet grid size read of the same dimension.
+  if (IsV5OrAbove && llvm::any_of(GroupSizes, [](Value *V) { return V; })) {
+    for (int I = 0; I < 3; I++) {
+      Value *GroupSize = GroupSizes[I];
+      if (!GroupSize)
+        continue;
+
+      for (User *U : GroupSize->users()) {
+        Instruction *Inst = dyn_cast<Instruction>(U);
+        if (isa<ZExtInst>(Inst))
+          Inst = Inst->getNextNode();
+
+        using namespace llvm::PatternMatch;
+        Value *Idx;
+        if (!match(Inst,
+                   m_UDiv(m_Load(m_GEP(
+                              m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+                              m_Value(Idx))),
+                          m_Value())))
+          continue;
+
+        ConstantInt *Offset = dyn_cast<ConstantInt>(Idx);
+        if (!Offset ||
+            Offset->getZExtValue() != GRID_SIZE_X + I * sizeof(uint32_t))
+          continue;
+
+        IRBuilder<> Builder(Inst);
+
+        Value *GEP = Builder.CreateConstGEP1_64(Builder.getInt8Ty(), CI,
+                                                HIDDEN_BLOCK_COUNT_X +
+                                                    I * sizeof(uint32_t));
+        Value *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
----------------
jhuber6 wrote:

The grid size can be any u32, but is there possibly some metadata that restricts that? I'm not sure how easy it would be to propagate here, since it's on the kernel, right?

https://github.com/llvm/llvm-project/pull/174112