[llvm] [AMDGPU] Optimize block count calculations to the new ABI (PR #174112)

Wed Dec 31 14:14:18 PST 2025

================
@@ -322,6 +325,48 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
     }
   }
 
+  // Upgrade the old method of calculating the block size using the grid size.
+  // We pattern match any case where the implicit argument group size is the
+  // divisor to a dispatch packet grid size read of the same dimension.
+  if (IsV5OrAbove && llvm::any_of(GroupSizes, [](Value *V) { return V; })) {
+    for (int I = 0; I < 3; I++) {
+      Value *GroupSize = GroupSizes[I];
+      if (!GroupSize)
+        continue;
+
+      for (User *U : GroupSize->users()) {
+        Instruction *Inst = dyn_cast<Instruction>(U);
+        if (isa<ZExtInst>(Inst))
+          Inst = Inst->getNextNode();
+
+        using namespace llvm::PatternMatch;
+        Value *Idx;
+        if (!match(Inst,
+                   m_UDiv(m_Load(m_GEP(
+                              m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+                              m_Value(Idx))),
+                          m_Value())))
+          continue;
+
+        ConstantInt *Offset = dyn_cast<ConstantInt>(Idx);
+        if (!Offset ||
+            Offset->getZExtValue() != GRID_SIZE_X + I * sizeof(uint32_t))
+          continue;
+
+        IRBuilder<> Builder(Inst);
+
+        Value *GEP = Builder.CreateConstGEP1_64(Builder.getInt8Ty(), CI,
+                                                HIDDEN_BLOCK_COUNT_X +
+                                                    I * sizeof(uint32_t));
+        Value *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
----------------
arsenm wrote:

Could you mark this load with `!invariant.load` and `!noundef` metadata, and also attach `!range` metadata if we know the bounds of the block count?

https://github.com/llvm/llvm-project/pull/174112