[llvm] 5cf9208 - [AMDGPU] Optimize block count calculations to the new ABI (#174112)

Thu Jan 8 14:03:05 PST 2026

Author: Joseph Huber
Date: 2026-01-08T16:03:00-06:00
New Revision: 5cf92086b78c6797647aaf351f4dd26fbf2244c4

URL: https://github.com/llvm/llvm-project/commit/5cf92086b78c6797647aaf351f4dd26fbf2244c4
DIFF: https://github.com/llvm/llvm-project/commit/5cf92086b78c6797647aaf351f4dd26fbf2244c4.diff

LOG: [AMDGPU] Optimize block count calculations to the new ABI (#174112)

Summary:
We already have a way to get the block count using the old grid size
lookup and dividing it by the number of threads per block (the workgroup
size). We did not want to make a new intrinsic to do the same thing, so
this optimization pattern matches on this usage to automatically optimize
it to the new form. This should improve performance of old kernels by
converting branches into a simple index lookup and removing the division.

Added: 
    llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 7959a70aa5f19..0c6bf27e9a1c2 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -109,6 +110,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
                                      /*Size=*/3, /*DefaultVal=*/0);
 
   if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+      !Intrinsic::getDeclarationIfExists(CI->getModule(),
+                                         Intrinsic::amdgcn_dispatch_ptr) &&
       none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
     return false;
 
@@ -323,6 +326,49 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
     }
   }
 
+  // Upgrade the old method of calculating the block count using the grid size.
+  // We pattern match any case where the implicit argument group size is the
+  // divisor to a dispatch packet grid size read of the same dimension.
+  if (IsV5OrAbove) {
+    for (int I = 0; I < 3; I++) {
+      Value *GroupSize = GroupSizes[I];
+      if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
+        continue;
+
+      for (User *U : GroupSize->users()) {
+        Instruction *Inst = cast<Instruction>(U);
+        if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+          Inst = cast<Instruction>(*Inst->user_begin());
+
+        using namespace llvm::PatternMatch;
+        if (!match(
+                Inst,
+                m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+                           m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+                           m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+                       m_Value())))
+          continue;
+
+        IRBuilder<> Builder(Inst);
+
+        Value *GEP = Builder.CreateInBoundsGEP(
+            Builder.getInt8Ty(), CI,
+            {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
+                              HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
+        Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+        BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+                                MDNode::get(CI->getContext(), {}));
+        BlockCount->setMetadata(LLVMContext::MD_noundef,
+                                MDNode::get(CI->getContext(), {}));
+
+        Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
+        Inst->replaceAllUsesWith(BlockCountExt);
+        Inst->eraseFromParent();
+        MadeChange = true;
+      }
+    }
+  }
+
   // If reqd_work_group_size is set, we can replace work group size with it.
   if (!HasReqdWorkGroupSize)
     return MadeChange;

diff  --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
new file mode 100644
index 0000000000000..25e43a0f332c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -0,0 +1,322 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck %s
+
+define i32 @num_blocks_x() {
+; CHECK-LABEL: define i32 @num_blocks_x() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_x, %conv_x
+  ret i32 %count_x
+}
+
+define i32 @num_blocks_y() {
+; CHECK-LABEL: define i32 @num_blocks_y() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4, !invariant.load [[META0]], !noundef [[META0]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+  %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_y = getelementptr i8, ptr addrspace(4) %implicitarg, i32 14
+  %wg_size_y = load i16, ptr addrspace(4) %i_gep_y, align 2
+  %conv_y = zext i16 %wg_size_y to i32
+  %count_y = udiv i32 %grid_size_y, %conv_y
+  ret i32 %count_y
+}
+
+define i32 @num_blocks_z() {
+; CHECK-LABEL: define i32 @num_blocks_z() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4, !invariant.load [[META0]], !noundef [[META0]]
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_z = getelementptr i8, ptr addrspace(4) %dispatch, i32 20
+  %grid_size_z = load i32, ptr addrspace(4) %d_gep_z, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_z = getelementptr i8, ptr addrspace(4) %implicitarg, i32 16
+  %wg_size_z = load i16, ptr addrspace(4) %i_gep_z, align 2
+  %conv_z = zext i16 %wg_size_z to i32
+  %count_z = udiv i32 %grid_size_z, %conv_z
+  ret i32 %count_z
+}
+
+define i32 @num_blocks(i32 %dim) {
+; CHECK-LABEL: define i32 @num_blocks(
+; CHECK-SAME: i32 [[DIM:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    switch i32 [[DIM]], label %[[DEFAULT:.*]] [
+; CHECK-NEXT:      i32 0, label %[[DIM_X:.*]]
+; CHECK-NEXT:      i32 1, label %[[DIM_Y:.*]]
+; CHECK-NEXT:      i32 2, label %[[DIM_Z:.*]]
+; CHECK-NEXT:    ]
+; CHECK:       [[DIM_X]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[DIM_Y]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[DIM_Z]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 8
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[DEFAULT]]:
+; CHECK-NEXT:    unreachable
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[RETVAL_IN:%.*]] = phi ptr addrspace(4) [ [[TMP1]], %[[DIM_X]] ], [ [[TMP0]], %[[DIM_Y]] ], [ [[TMP2]], %[[DIM_Z]] ]
+; CHECK-NEXT:    [[RETVAL_0_I:%.*]] = load i32, ptr addrspace(4) [[RETVAL_IN]], align 4, !invariant.load [[META0]], !noundef [[META0]]
+; CHECK-NEXT:    ret i32 [[RETVAL_0_I]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+
+  switch i32 %dim, label %default [
+  i32 0, label %dim_x
+  i32 1, label %dim_y
+  i32 2, label %dim_z
+  ]
+
+dim_x:
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_x, %conv_x
+  br label %exit
+
+dim_y:
+  %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+  %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+  %i_gep_y = getelementptr i8, ptr addrspace(4) %implicitarg, i32 14
+  %wg_size_y = load i16, ptr addrspace(4) %i_gep_y, align 2
+  %conv_y = zext i16 %wg_size_y to i32
+  %count_y = udiv i32 %grid_size_y, %conv_y
+  br label %exit
+
+dim_z:
+  %d_gep_z = getelementptr i8, ptr addrspace(4) %dispatch, i32 20
+  %grid_size_z = load i32, ptr addrspace(4) %d_gep_z, align 4
+  %i_gep_z = getelementptr i8, ptr addrspace(4) %implicitarg, i32 16
+  %wg_size_z = load i16, ptr addrspace(4) %i_gep_z, align 2
+  %conv_z = zext i16 %wg_size_z to i32
+  %count_z = udiv i32 %grid_size_z, %conv_z
+  br label %exit
+
+default:
+  unreachable
+
+exit:
+  %retval = phi i32 [ %count_x, %dim_x ], [ %count_y, %dim_y ], [ %count_z, %dim_z ]
+  ret i32 %retval
+}
+
+define i64 @larger() {
+; CHECK-LABEL: define i64 @larger() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
+; CHECK-NEXT:    [[CONV_GRID_X:%.*]] = zext i32 [[GRID_SIZE_X]] to i64
+; CHECK-NEXT:    ret i64 [[CONV_GRID_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i64
+  %conv_grid_x = zext i32 %grid_size_x to i64
+  %count_x = udiv i64 %conv_grid_x, %conv_x
+  ret i64 %count_x
+}
+
+define i32 @bad_offset() {
+; CHECK-LABEL: define i32 @bad_offset() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
+; CHECK-NEXT:    [[GRID_SIZE_Y:%.*]] = load i32, ptr addrspace(4) [[D_GEP_Y]], align 4
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT:    [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT:    [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
+; CHECK-NEXT:    [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_Y]], [[CONV_X]]
+; CHECK-NEXT:    ret i32 [[COUNT_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_y = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+  %grid_size_y = load i32, ptr addrspace(4) %d_gep_y, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_y, %conv_x
+  ret i32 %count_x
+}
+
+define i32 @dangling() {
+; CHECK-LABEL: define i32 @dangling() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT:    ret i32 [[GRID_SIZE_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i32
+  ret i32 %grid_size_x
+}
+
+define i32 @wrong_cast() {
+; CHECK-LABEL: define i32 @wrong_cast() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT:    [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT:    [[CONV_X:%.*]] = sext i16 [[WG_SIZE_X]] to i32
+; CHECK-NEXT:    [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
+; CHECK-NEXT:    ret i32 [[COUNT_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = sext i16 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_x, %conv_x
+  ret i32 %count_x
+}
+
+define i32 @wrong_size() {
+; CHECK-LABEL: define i32 @wrong_size() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT:    [[WG_SIZE_X:%.*]] = load i8, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT:    [[CONV_X:%.*]] = zext i8 [[WG_SIZE_X]] to i32
+; CHECK-NEXT:    [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
+; CHECK-NEXT:    ret i32 [[COUNT_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i8, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i8 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_x, %conv_x
+  ret i32 %count_x
+}
+
+define i32 @wrong_intrinsic() {
+; CHECK-LABEL: define i32 @wrong_intrinsic() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT:    [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT:    [[CONV_X:%.*]] = zext i16 [[WG_SIZE_X]] to i32
+; CHECK-NEXT:    [[COUNT_X:%.*]] = udiv i32 [[GRID_SIZE_X]], [[CONV_X]]
+; CHECK-NEXT:    ret i32 [[COUNT_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 16
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x = zext i16 %wg_size_x to i32
+  %count_x = udiv i32 %grid_size_x, %conv_x
+  ret i32 %count_x
+}
+
+define i16 @empty_use() {
+; CHECK-LABEL: define i16 @empty_use() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
+; CHECK-NEXT:    [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[D_GEP_X]], align 4
+; CHECK-NEXT:    [[TRUNC_X:%.*]] = trunc i32 [[GRID_SIZE_X]] to i16
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[I_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[IMPLICITARG]], i64 12
+; CHECK-NEXT:    [[WG_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[I_GEP_X]], align 2
+; CHECK-NEXT:    [[COUNT_X:%.*]] = udiv i16 [[TRUNC_X]], [[WG_SIZE_X]]
+; CHECK-NEXT:    ret i16 [[COUNT_X]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %trunc_x = trunc i32 %grid_size_x to i16
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %count_x = udiv i16 %trunc_x, %wg_size_x
+  ret i16 %count_x
+}
+
+define i32 @multiple_use() {
+; CHECK-LABEL: define i32 @multiple_use() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
+; CHECK-NEXT:    [[SUM:%.*]] = shl i32 [[TMP0]], 1
+; CHECK-NEXT:    ret i32 [[SUM]]
+;
+entry:
+  %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  %d_gep_x = getelementptr i8, ptr addrspace(4) %dispatch, i32 12
+  %grid_size_x = load i32, ptr addrspace(4) %d_gep_x, align 4
+  %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  %i_gep_x = getelementptr i8, ptr addrspace(4) %implicitarg, i32 12
+  %wg_size_x = load i16, ptr addrspace(4) %i_gep_x, align 2
+  %conv_x_1 = zext i16 %wg_size_x to i32
+  %count_x_1 = udiv i32 %grid_size_x, %conv_x_1
+  %conv_x_2 = zext i16 %wg_size_x to i32
+  %count_x_2 = udiv i32 %grid_size_x, %conv_x_2
+  %sum = add i32 %count_x_1, %count_x_2
+  ret i32 %sum
+}
+;.
+; CHECK: [[META0]] = !{}
+;.