[llvm] [AMDGPU] Optimize block count calculations to the new ABI (PR #174112)
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 1 07:45:26 PST 2026
https://github.com/jhuber6 updated https://github.com/llvm/llvm-project/pull/174112
From 25f08c9c7919062f5afc0666de7ff368facb342c Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 1 Jan 2026 09:39:26 -0600
Subject: [PATCH 1/3] preformat
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 45 +++++++++----------
1 file changed, 22 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 248d7dcc9ec3e..910da9cb28d82 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
+//===-- AMDGPULowerKernelAttributes.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -66,13 +66,11 @@ class AMDGPULowerKernelAttributes : public ModulePass {
bool runOnModule(Module &M) override;
- StringRef getPassName() const override {
- return "AMDGPU Kernel Attributes";
- }
+ StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
- }
+ }
};
Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
@@ -104,7 +102,7 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
SmallVector<unsigned> MaxNumWorkgroups =
AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
@@ -115,9 +113,9 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
return false;
Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
- Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
- Value *Remainders[3] = {nullptr, nullptr, nullptr};
- Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+ Value *Remainders[3] = {nullptr, nullptr, nullptr};
+ Value *GridSizes[3] = {nullptr, nullptr, nullptr};
const DataLayout &DL = F->getDataLayout();
@@ -230,13 +228,15 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
bool MadeChange = false;
if (IsV5OrAbove && HasUniformWorkGroupSize) {
- // Under v5 __ockl_get_local_size returns the value computed by the expression:
+ // Under v5 __ockl_get_local_size returns the value computed by the
+ // expression:
//
- // workgroup_id < hidden_block_count ? hidden_group_size : hidden_remainder
+ // workgroup_id < hidden_block_count ? hidden_group_size :
+ // hidden_remainder
//
- // For functions with the attribute uniform-work-group-size=true. we can evaluate
- // workgroup_id < hidden_block_count as true, and thus hidden_group_size is returned
- // for __ockl_get_local_size.
+    // For functions with the attribute uniform-work-group-size=true, we can
+ // evaluate workgroup_id < hidden_block_count as true, and thus
+ // hidden_group_size is returned for __ockl_get_local_size.
for (int I = 0; I < 3; ++I) {
Value *BlockCount = BlockCounts[I];
if (!BlockCount)
@@ -261,7 +261,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
for (Value *Remainder : Remainders) {
if (!Remainder)
continue;
- Remainder->replaceAllUsesWith(Constant::getNullValue(Remainder->getType()));
+ Remainder->replaceAllUsesWith(
+ Constant::getNullValue(Remainder->getType()));
MadeChange = true;
}
} else if (HasUniformWorkGroupSize) { // Pre-V5.
@@ -302,13 +303,13 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
continue;
for (User *UMin : ZextGroupSize->users()) {
- if (match(UMin,
- m_UMin(m_Sub(m_Specific(GridSize),
- m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize))),
- m_Specific(ZextGroupSize)))) {
+ if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin,
+ m_Specific(ZextGroupSize))),
+ m_Specific(ZextGroupSize)))) {
if (HasReqdWorkGroupSize) {
- ConstantInt *KnownSize
- = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ ConstantInt *KnownSize =
+ mdconst::extract<ConstantInt>(MD->getOperand(I));
UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
KnownSize, UMin->getType(), false, DL));
} else {
@@ -340,7 +341,6 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
return MadeChange;
}
-
// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
@@ -364,7 +364,6 @@ bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
return MadeChange;
}
-
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
"AMDGPU Kernel Attributes", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
From 83b1968c0f332b72915ac8c08f2648e2954a7c4f Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Wed, 31 Dec 2025 15:08:01 -0600
Subject: [PATCH 2/3] [AMDGPU] Optimize block count calculations to the new ABI
Summary:
We already have a way to get the block count: read the grid size through the
old dispatch packet lookup and divide it by the number of threads in a group.
Rather than add a new intrinsic that returns the same value, this optimization
pattern matches that usage and rewrites it to the new form. This should
improve the performance of old kernels by converting the branches into a
simple indexed load and removing the division.
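
As a sketch of the rewrite (offsets follow the layout used by the tests:
grid_size_x at byte 12 of the dispatch packet, hidden_group_size_x at byte 12
and hidden_block_count_x at byte 0 of the implicit arguments; value names are
illustrative), the old computation

  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
  %grid_gep = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
  %grid_size = load i32, ptr addrspace(4) %grid_gep, align 4
  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %wg_gep = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
  %wg_size = load i16, ptr addrspace(4) %wg_gep, align 2
  %conv = zext i16 %wg_size to i32
  %count = udiv i32 %grid_size, %conv

becomes a single load of the hidden block count at the front of the implicit
argument buffer:

  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  %count = load i32, ptr addrspace(4) %implicitarg, align 4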
cleanup test
Comments
more tests
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 47 ++++
.../AMDGPU/implicit-arg-block-count.ll | 222 ++++++++++++++++++
2 files changed, 269 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 910da9cb28d82..e47d45f55b5b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -109,6 +110,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
/*Size=*/3, /*DefaultVal=*/0);
if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ !Intrinsic::getDeclarationIfExists(CI->getModule(),
+ Intrinsic::amdgcn_dispatch_ptr) &&
none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
return false;
@@ -323,6 +326,50 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
}
}
+  // Upgrade the old method of calculating the block count using the grid size.
+  // We pattern match any case where the implicit argument group size divides
+  // a dispatch packet grid size read of the same dimension.
+ if (IsV5OrAbove) {
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ Instruction *Inst = cast<Instruction>(U);
+ if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+ Inst = dyn_cast<Instruction>(*Inst->user_begin());
+
+ using namespace llvm::PatternMatch;
+ if (!match(
+ Inst,
+ m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+ m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+ m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+ m_Value())))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ Value *GEP = Builder.CreateConstGEP1_64(Builder.getInt8Ty(), CI,
+ HIDDEN_BLOCK_COUNT_X +
+ I * sizeof(uint32_t));
+ Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+ if (MDNode *Node = Inst->getMetadata(LLVMContext::MD_invariant_load))
+ BlockCount->setMetadata(LLVMContext::MD_invariant_load, Node);
+ if (MDNode *Node = Inst->getMetadata(LLVMContext::MD_noundef))
+ BlockCount->setMetadata(LLVMContext::MD_noundef, Node);
+
+ BlockCount =
+ cast<Instruction>(Builder.CreateZExt(BlockCount, Inst->getType()));
+
+ Inst->replaceAllUsesWith(BlockCount);
+ Inst->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+ }
+
// If reqd_work_group_size is set, we can replace work group size with it.
if (!HasReqdWorkGroupSize)
return MadeChange;
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
new file mode 100644
index 0000000000000..24ef8a3932354
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -0,0 +1,222 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,simplifycfg %s | FileCheck %s
+
+define i32 @num_blocks(i32 noundef %dim) {
+; CHECK-LABEL: define i32 @num_blocks(
+; CHECK-SAME: i32 noundef [[DIM:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: switch i32 [[DIM]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT: i32 0, label %[[EXIT:.*]]
+; CHECK-NEXT: i32 1, label %[[DIM_Y:.*]]
+; CHECK-NEXT: i32 2, label %[[DIM_Z:.*]]
+; CHECK-NEXT: ]
+; CHECK: [[DIM_Y]]:
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i64 4
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[DIM_Z]]:
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP1]], i64 8
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[DEFAULT]]:
+; CHECK-NEXT: unreachable
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[RETVAL_0_I_IN:%.*]] = phi ptr addrspace(4) [ [[TMP2]], %[[DIM_Y]] ], [ [[TMP3]], %[[DIM_Z]] ], [ [[TMP1]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RETVAL_0_I:%.*]] = load i32, ptr addrspace(4) [[RETVAL_0_I_IN]], align 4
+; CHECK-NEXT: ret i32 [[RETVAL_0_I]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+
+ switch i32 %dim, label %default [
+ i32 0, label %dim_x
+ i32 1, label %dim_y
+ i32 2, label %dim_z
+ ]
+
+dim_x:
+ %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+ %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i32
+ %count_x = udiv i32 %grid_size_x, %conv_x
+ br label %exit
+
+dim_y:
+ %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+ %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+ %i_gep_y = getelementptr i8, ptr addrspace(4) %implicitarg, i32 14
+ %wg_size_y = load i16, ptr addrspace(4) %i_gep_y, align 2
+ %conv_y = zext i16 %wg_size_y to i32
+ %count_y = udiv i32 %grid_size_y, %conv_y
+ br label %exit
+
+dim_z:
+ %d_gep_z = getelementptr i8, ptr addrspace(4) %dispatch, i32 20
+ %grid_size_z = load i32, ptr addrspace(4) %d_gep_z, align 4
+ %i_gep_z = getelementptr i8, ptr addrspace(4) %implicitarg, i32 16
+ %wg_size_z = load i16, ptr addrspace(4) %i_gep_z, align 2
+ %conv_z = zext i16 %wg_size_z to i32
+ %count_z = udiv i32 %grid_size_z, %conv_z
+ br label %exit
+
+default:
+ unreachable
+
+exit:
+ %retval = phi i32 [ %count_x, %dim_x ], [ %count_y, %dim_y ], [ %count_z, %dim_z ]
+ ret i32 %retval
+}
+
+ at dim_const = constant i32 1, align 4
+
+define i32 @constant() {
+; CHECK-LABEL: define i32 @constant() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %dim = load i32, ptr @dim_const, align 4
+
+ switch i32 %dim, label %default [
+ i32 0, label %dim_x
+ i32 1, label %dim_y
+ i32 2, label %dim_z
+ ]
+
+dim_x:
+ %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+ %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i32
+ %count_x = udiv i32 %grid_size_x, %conv_x
+ br label %exit
+
+dim_y:
+ %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+ %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+ %i_gep_y = getelementptr i8, ptr addrspace(4) %implicitarg, i32 14
+ %wg_size_y = load i16, ptr addrspace(4) %i_gep_y, align 2
+ %conv_y = zext i16 %wg_size_y to i32
+ %count_y = udiv i32 %grid_size_y, %conv_y
+ br label %exit
+
+dim_z:
+ %d_gep_z = getelementptr i8, ptr addrspace(4) %dispatch, i32 20
+ %grid_size_z = load i32, ptr addrspace(4) %d_gep_z, align 4
+ %i_gep_z = getelementptr i8, ptr addrspace(4) %implicitarg, i32 16
+ %wg_size_z = load i16, ptr addrspace(4) %i_gep_z, align 2
+ %conv_z = zext i16 %wg_size_z to i32
+ %count_z = udiv i32 %grid_size_z, %conv_z
+ br label %exit
+
+default:
+ unreachable
+
+exit:
+ %retval = phi i32 [ %count_x, %dim_x ], [ %count_y, %dim_y ], [ %count_z, %dim_z ]
+ ret i32 %retval
+}
+
+define i64 @larger() {
+; CHECK-LABEL: define i64 @larger() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i64
+; CHECK-NEXT: [[CONV_GRID_X:%.*]] = zext i32 [[GRID_SIZE_X]] to i64
+; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i64 [[CONV_GRID_X]], [[CONV_X]]
+; CHECK-NEXT: ret i64 [[COUNT_X]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+ %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i64
+ %conv_grid_x = zext i32 %grid_size_x to i64
+ %count_x = udiv i64 %conv_grid_x, %conv_x
+ ret i64 %count_x
+}
+
+define i32 @bad_offset() {
+; CHECK-LABEL: define i32 @bad_offset() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
+; CHECK-NEXT: [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[D_GEP_Y]], align 4
+; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
+; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_Y]], [[CONV_X]]
+; CHECK-NEXT: ret i32 [[COUNT_X]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+ %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i32
+ %count_x = udiv i32 %grid_size_y, %conv_x
+ ret i32 %count_x
+}
+
+define i32 @dangling() {
+; CHECK-LABEL: define i32 @dangling() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT: ret i32 [[GRID_SIZE_X]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+ %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+ %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i32
+ ret i32 %grid_size_x
+}
+
+define i32 @wrong_intrinsic() {
+; CHECK-LABEL: define i32 @wrong_intrinsic() {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
+; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
+; CHECK-NEXT: ret i32 [[COUNT_X]]
+;
+entry:
+ %dispatch = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+ %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+ %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+ %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+ %conv_x = zext i16 %wg_size_x to i32
+ %count_x = udiv i32 %grid_size_x, %conv_x
+ ret i32 %count_x
+}
From 92c82cfe3e7bb8304c0cbd63de1c45518934f4db Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn at outlook.com>
Date: Thu, 1 Jan 2026 09:45:16 -0600
Subject: [PATCH 3/3] invariant
---
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 8 ++++----
.../AMDGPU/implicit-arg-block-count.ll | 19 ++++++++-----------
2 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index e47d45f55b5b9..1f4a0c879cdea 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -355,10 +355,10 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
HIDDEN_BLOCK_COUNT_X +
I * sizeof(uint32_t));
Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
- if (MDNode *Node = Inst->getMetadata(LLVMContext::MD_invariant_load))
- BlockCount->setMetadata(LLVMContext::MD_invariant_load, Node);
- if (MDNode *Node = Inst->getMetadata(LLVMContext::MD_noundef))
- BlockCount->setMetadata(LLVMContext::MD_noundef, Node);
+ BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(CI->getContext(), {}));
+ BlockCount->setMetadata(LLVMContext::MD_noundef,
+ MDNode::get(CI->getContext(), {}));
BlockCount =
cast<Instruction>(Builder.CreateZExt(BlockCount, Inst->getType()));
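
The two metadata kinds are now set unconditionally instead of being copied
from the matched load: the hidden implicit arguments are initialized by the
runtime before the kernel starts and do not change, so the replacement load
is always invariant and never undef. Both kinds take the empty metadata node,
so the rewritten load ends up looking roughly like this (a sketch; value
names are illustrative):

  %block_count = load i32, ptr addrspace(4) %gep, align 4, !invariant.load !0, !noundef !0
  ...
  !0 = !{}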
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
index 24ef8a3932354..4f20e39ba4e62 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -20,8 +20,8 @@ define i32 @num_blocks(i32 noundef %dim) {
; CHECK: [[DEFAULT]]:
; CHECK-NEXT: unreachable
; CHECK: [[EXIT]]:
-; CHECK-NEXT: [[RETVAL_0_I_IN:%.*]] = phi ptr addrspace(4) [ [[TMP2]], %[[DIM_Y]] ], [ [[TMP3]], %[[DIM_Z]] ], [ [[TMP1]], %[[ENTRY]] ]
-; CHECK-NEXT: [[RETVAL_0_I:%.*]] = load i32, ptr addrspace(4) [[RETVAL_0_I_IN]], align 4
+; CHECK-NEXT: [[RETVAL_IN:%.*]] = phi ptr addrspace(4) [ [[TMP2]], %[[DIM_Y]] ], [ [[TMP3]], %[[DIM_Z]] ], [ [[TMP1]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RETVAL_0_I:%.*]] = load i32, ptr addrspace(4) [[RETVAL_IN]], align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]]
; CHECK-NEXT: ret i32 [[RETVAL_0_I]]
;
entry:
@@ -76,7 +76,7 @@ define i32 @constant() {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4, !invariant.load [[META0]], !noundef [[META0]]
; CHECK-NEXT: ret i32 [[TMP1]]
;
entry:
@@ -128,16 +128,10 @@ exit:
define i64 @larger() {
; CHECK-LABEL: define i64 @larger() {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
-; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; CHECK-NEXT: [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
-; CHECK-NEXT: [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
-; CHECK-NEXT: [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i64
+; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
; CHECK-NEXT: [[CONV_GRID_X:%.*]] = zext i32 [[GRID_SIZE_X]] to i64
-; CHECK-NEXT: [[COUNT_X:%.*]] = udiv i64 [[CONV_GRID_X]], [[CONV_X]]
-; CHECK-NEXT: ret i64 [[COUNT_X]]
+; CHECK-NEXT: ret i64 [[CONV_GRID_X]]
;
entry:
%dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -220,3 +214,6 @@ entry:
%count_x = udiv i32 %grid_size_x, %conv_x
ret i32 %count_x
}
+;.
+; CHECK: [[META0]] = !{}
+;.