[clang] [llvm] [AMDGPU] Move AMDGPUAttributor earlier with lowering kernel attributes (PR #177432)
Yoonseo Choi via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 22 15:46:21 PST 2026
https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/177432
From a7b28cf2424e788a3416ba7b411241fc4387d7d0 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi at amd.com>
Date: Thu, 22 Jan 2026 12:48:23 -0600
Subject: [PATCH 1/2] [AMDGPU] Move AMDGPUAttributor earlier with lowering
kernel attributes
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 -
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 362 +++++++++++++-
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 443 ------------------
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 -
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 29 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 -
...amdgpu-max-num-workgroups-load-annotate.ll | 10 +-
.../AMDGPU/implicit-arg-block-count.ll | 37 +-
.../CodeGen/AMDGPU/implicit-arg-v5-opt.ll | 2 +-
.../CodeGen/AMDGPU/reqd-work-group-size.ll | 4 +-
llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 2 +-
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 -
12 files changed, 402 insertions(+), 500 deletions(-)
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..de76dd6ab3bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -123,15 +123,6 @@ struct AMDGPUPromoteKernelArgumentsPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-ModulePass *createAMDGPULowerKernelAttributesPass();
-void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
-extern char &AMDGPULowerKernelAttributesID;
-
-struct AMDGPULowerKernelAttributesPass
- : PassInfoMixin<AMDGPULowerKernelAttributesPass> {
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSLegacyPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..1f4229a2b15a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,8 +13,14 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
@@ -50,6 +56,343 @@ static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
#include "AMDGPUAttributes.def"
};
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+ WORKGROUP_SIZE_X = 4,
+ WORKGROUP_SIZE_Y = 6,
+ WORKGROUP_SIZE_Z = 8,
+
+ GRID_SIZE_X = 12,
+ GRID_SIZE_Y = 16,
+ GRID_SIZE_Z = 20
+};
+
+// Field offsets to implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+ HIDDEN_BLOCK_COUNT_X = 0,
+ HIDDEN_BLOCK_COUNT_Y = 4,
+ HIDDEN_BLOCK_COUNT_Z = 8,
+
+ HIDDEN_GROUP_SIZE_X = 12,
+ HIDDEN_GROUP_SIZE_Y = 14,
+ HIDDEN_GROUP_SIZE_Z = 16,
+
+ HIDDEN_REMAINDER_X = 18,
+ HIDDEN_REMAINDER_Y = 20,
+ HIDDEN_REMAINDER_Z = 22,
+};
+
+static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+ auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+ : Intrinsic::amdgcn_dispatch_ptr;
+ return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
+}
+
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+ uint32_t MaxNumGroups) {
+ if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+ return;
+
+ if (!Load->getType()->isIntegerTy(32))
+ return;
+
+ // TODO: If there is existing range metadata, preserve it if it is stricter.
+ MDBuilder MDB(Load->getContext());
+ MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+ Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
+ Function *F = CI->getFunction();
+
+ auto *MD = F->getMetadata("reqd_work_group_size");
+ const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+ const bool HasUniformWorkGroupSize =
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+
+ SmallVector<unsigned> MaxNumWorkgroups =
+ AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
+ /*Size=*/3, /*DefaultVal=*/0);
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ !Intrinsic::getDeclarationIfExists(CI->getModule(),
+ Intrinsic::amdgcn_dispatch_ptr) &&
+ none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
+ return false;
+
+ Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+ Value *Remainders[3] = {nullptr, nullptr, nullptr};
+ Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+
+ const DataLayout &DL = F->getDataLayout();
+
+ // We expect to see several GEP users, cast to the appropriate type and
+ // loaded.
+ for (User *U : CI->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ int64_t Offset = 0;
+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+ auto *BCI = dyn_cast<BitCastInst>(U);
+ if (!Load && !BCI) {
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+ Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+ BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ }
+
+ if (BCI) {
+ if (!BCI->hasOneUse())
+ continue;
+ Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+ }
+
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+ // TODO: Handle merged loads.
+ if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+ switch (Offset) {
+ case HIDDEN_BLOCK_COUNT_X:
+ if (LoadSize == 4) {
+ BlockCounts[0] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+ }
+ break;
+ case HIDDEN_BLOCK_COUNT_Y:
+ if (LoadSize == 4) {
+ BlockCounts[1] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+ }
+ break;
+ case HIDDEN_BLOCK_COUNT_Z:
+ if (LoadSize == 4) {
+ BlockCounts[2] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+ }
+ break;
+ case HIDDEN_GROUP_SIZE_X:
+ if (LoadSize == 2)
+ GroupSizes[0] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Y:
+ if (LoadSize == 2)
+ GroupSizes[1] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Z:
+ if (LoadSize == 2)
+ GroupSizes[2] = Load;
+ break;
+ case HIDDEN_REMAINDER_X:
+ if (LoadSize == 2)
+ Remainders[0] = Load;
+ break;
+ case HIDDEN_REMAINDER_Y:
+ if (LoadSize == 2)
+ Remainders[1] = Load;
+ break;
+ case HIDDEN_REMAINDER_Z:
+ if (LoadSize == 2)
+ Remainders[2] = Load;
+ break;
+ default:
+ break;
+ }
+ } else { // Base is DispatchPtr.
+ switch (Offset) {
+ case WORKGROUP_SIZE_X:
+ if (LoadSize == 2)
+ GroupSizes[0] = Load;
+ break;
+ case WORKGROUP_SIZE_Y:
+ if (LoadSize == 2)
+ GroupSizes[1] = Load;
+ break;
+ case WORKGROUP_SIZE_Z:
+ if (LoadSize == 2)
+ GroupSizes[2] = Load;
+ break;
+ case GRID_SIZE_X:
+ if (LoadSize == 4)
+ GridSizes[0] = Load;
+ break;
+ case GRID_SIZE_Y:
+ if (LoadSize == 4)
+ GridSizes[1] = Load;
+ break;
+ case GRID_SIZE_Z:
+ if (LoadSize == 4)
+ GridSizes[2] = Load;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ bool MadeChange = false;
+ if (IsV5OrAbove && HasUniformWorkGroupSize) {
+ // Under v5 __ockl_get_local_size returns the value computed by the
+ // expression:
+ //
+ // workgroup_id < hidden_block_count ? hidden_group_size :
+ // hidden_remainder
+ //
+ // For functions with the attribute uniform-work-group-size=true, we can
+ // evaluate workgroup_id < hidden_block_count as true, and thus
+ // hidden_group_size is returned for __ockl_get_local_size.
+ for (int I = 0; I < 3; ++I) {
+ Value *BlockCount = BlockCounts[I];
+ if (!BlockCount)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin =
+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ for (User *ICmp : BlockCount->users()) {
+ if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+ m_Specific(BlockCount)))) {
+ ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+ MadeChange = true;
+ }
+ }
+ }
+
+ // All remainders should be 0 with uniform work group size.
+ for (Value *Remainder : Remainders) {
+ if (!Remainder)
+ continue;
+ Remainder->replaceAllUsesWith(
+ Constant::getNullValue(Remainder->getType()));
+ MadeChange = true;
+ }
+ } else if (HasUniformWorkGroupSize) { // Pre-V5.
+ // Pattern match the code used to handle partial workgroup dispatches in the
+ // library implementation of get_local_size, so the entire function can be
+ // constant folded with a known group size.
+ //
+ // uint r = grid_size - group_id * group_size;
+ // get_local_size = (r < group_size) ? r : group_size;
+ //
+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+ // the grid_size is required to be a multiple of group_size. In this case:
+ //
+ // grid_size - (group_id * group_size) < group_size
+ // ->
+ // grid_size < group_size + (group_id * group_size)
+ //
+ // (grid_size / group_size) < 1 + group_id
+ //
+ // grid_size / group_size is at least 1, so we can conclude the select
+ // condition is false (except for group_id == 0, where the select result is
+ // the same).
+ for (int I = 0; I < 3; ++I) {
+ Value *GroupSize = GroupSizes[I];
+ Value *GridSize = GridSizes[I];
+ if (!GroupSize || !GridSize)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin =
+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ for (User *U : GroupSize->users()) {
+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+ if (!ZextGroupSize)
+ continue;
+
+ for (User *UMin : ZextGroupSize->users()) {
+ if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin,
+ m_Specific(ZextGroupSize))),
+ m_Specific(ZextGroupSize)))) {
+ if (HasReqdWorkGroupSize) {
+ ConstantInt *KnownSize =
+ mdconst::extract<ConstantInt>(MD->getOperand(I));
+ UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
+ KnownSize, UMin->getType(), false, DL));
+ } else {
+ UMin->replaceAllUsesWith(ZextGroupSize);
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+ }
+ }
+
+ // Upgrade the old method of calculating the block size using the grid size.
+ // We pattern match any case where the implicit argument group size is the
+ // divisor to a dispatch packet grid size read of the same dimension.
+ if (IsV5OrAbove) {
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ Instruction *Inst = cast<Instruction>(U);
+ if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+ Inst = cast<Instruction>(*Inst->user_begin());
+
+ using namespace llvm::PatternMatch;
+ if (!match(
+ Inst,
+ m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+ m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+ m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+ m_Value())))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Builder.getInt8Ty(), CI,
+ {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
+ HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
+ Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+ BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(CI->getContext(), {}));
+ BlockCount->setMetadata(LLVMContext::MD_noundef,
+ MDNode::get(CI->getContext(), {}));
+
+ Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
+ Inst->replaceAllUsesWith(BlockCountExt);
+ Inst->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+ }
+
+ // If reqd_work_group_size is set, we can replace work group size with it.
+ if (!HasReqdWorkGroupSize)
+ return MadeChange;
+
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ GroupSize->replaceAllUsesWith(
+ ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
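For context, a minimal IR sketch of the v5 shape that processUse matches and folds under uniform-work-group-size (value names here are illustrative, not taken from the patch):

  ; Sketch of an __ockl_get_local_size-style sequence under code object v5.
  declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  declare i32 @llvm.amdgcn.workgroup.id.x()

  define i16 @get_local_size_x_sketch() #0 {
    %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    ; HIDDEN_BLOCK_COUNT_X is at offset 0, HIDDEN_GROUP_SIZE_X at 12,
    ; HIDDEN_REMAINDER_X at 18.
    %block.count = load i32, ptr addrspace(4) %implicitarg, align 4
    %gs.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12
    %group.size = load i16, ptr addrspace(4) %gs.gep, align 2
    %rem.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 18
    %remainder = load i16, ptr addrspace(4) %rem.gep, align 2
    %gid = call i32 @llvm.amdgcn.workgroup.id.x()
    %in.full.block = icmp ult i32 %gid, %block.count
    %size = select i1 %in.full.block, i16 %group.size, i16 %remainder
    ret i16 %size
  }

  attributes #0 = { "uniform-work-group-size"="true" }

With uniform-work-group-size=true the icmp folds to true and the remainder load folds to 0, so the select collapses to %group.size; if reqd_work_group_size metadata is also present, the group-size load itself is folded to the known constant.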
@@ -1660,7 +2003,24 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
}
}
- return A.run() == ChangeStatus::CHANGED;
+ bool Changed = A.run() == ChangeStatus::CHANGED;
+
+ // Kernel attribute lowering (merged from AMDGPULowerKernelAttributesPass)
+ bool IsV5OrAbove =
+ AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
+ Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
+ if (BasePtr) {
+ for (auto *F : Functions) {
+ for (Instruction &I : instructions(*F)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() == BasePtr)
+ Changed |= processUse(CI, IsV5OrAbove);
+ }
+ }
+ }
+ }
+
+ return Changed;
}
} // namespace
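Similarly, a before/after sketch of the block-count upgrade in the v5 path (again illustrative, not from the patch):

  ; The legacy computation: dispatch-packet grid size (GRID_SIZE_X, offset 12)
  ; divided by the implicit-arg group size (HIDDEN_GROUP_SIZE_X, offset 12).
  declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()

  define i32 @num_blocks_x_sketch() {
    %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
    %gs.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12
    %group.size = load i16, ptr addrspace(4) %gs.gep, align 2
    %group.size.ext = zext i16 %group.size to i32
    %grid.gep = getelementptr i8, ptr addrspace(4) %dispatch, i64 12
    %grid.size = load i32, ptr addrspace(4) %grid.gep, align 4
    %num.blocks = udiv i32 %grid.size, %group.size.ext
    ret i32 %num.blocks
  }

  ; After the upgrade (and once instcombine drops the zero-offset GEP, as in
  ; the implicit-arg-block-count.ll checks), the udiv becomes a direct load of
  ; HIDDEN_BLOCK_COUNT_X from the implicit argument pointer:
  ;   %num.blocks = load i32, ptr addrspace(4) %implicitarg, align 4,
  ;                           !invariant.load !0, !noundef !0   ; !0 = !{}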
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
deleted file mode 100644
index fbfb71059b6b1..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This pass does attempts to make use of reqd_work_group_size metadata
-/// to eliminate loads from the dispatch packet and to constant fold OpenCL
-/// get_local_size-like functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Pass.h"
-
-#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
-
-using namespace llvm;
-
-namespace {
-
-// Field offsets in hsa_kernel_dispatch_packet_t.
-enum DispatchPackedOffsets {
- WORKGROUP_SIZE_X = 4,
- WORKGROUP_SIZE_Y = 6,
- WORKGROUP_SIZE_Z = 8,
-
- GRID_SIZE_X = 12,
- GRID_SIZE_Y = 16,
- GRID_SIZE_Z = 20
-};
-
-// Field offsets to implicit kernel argument pointer.
-enum ImplicitArgOffsets {
- HIDDEN_BLOCK_COUNT_X = 0,
- HIDDEN_BLOCK_COUNT_Y = 4,
- HIDDEN_BLOCK_COUNT_Z = 8,
-
- HIDDEN_GROUP_SIZE_X = 12,
- HIDDEN_GROUP_SIZE_Y = 14,
- HIDDEN_GROUP_SIZE_Z = 16,
-
- HIDDEN_REMAINDER_X = 18,
- HIDDEN_REMAINDER_Y = 20,
- HIDDEN_REMAINDER_Z = 22,
-};
-
-class AMDGPULowerKernelAttributes : public ModulePass {
-public:
- static char ID;
-
- AMDGPULowerKernelAttributes() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override;
-
- StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-};
-
-Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
- auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
- : Intrinsic::amdgcn_dispatch_ptr;
- return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
-}
-
-} // end anonymous namespace
-
-static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
- uint32_t MaxNumGroups) {
- if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
- return;
-
- if (!Load->getType()->isIntegerTy(32))
- return;
-
- // TODO: If there is existing range metadata, preserve it if it is stricter.
- MDBuilder MDB(Load->getContext());
- MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
- Load->setMetadata(LLVMContext::MD_range, Range);
-}
-
-static bool processUse(CallInst *CI, bool IsV5OrAbove) {
- Function *F = CI->getFunction();
-
- auto *MD = F->getMetadata("reqd_work_group_size");
- const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
-
- const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsBool();
-
- SmallVector<unsigned> MaxNumWorkgroups =
- AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
- /*Size=*/3, /*DefaultVal=*/0);
-
- if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
- !Intrinsic::getDeclarationIfExists(CI->getModule(),
- Intrinsic::amdgcn_dispatch_ptr) &&
- none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
- return false;
-
- Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
- Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
- Value *Remainders[3] = {nullptr, nullptr, nullptr};
- Value *GridSizes[3] = {nullptr, nullptr, nullptr};
-
- const DataLayout &DL = F->getDataLayout();
-
- // We expect to see several GEP users, casted to the appropriate type and
- // loaded.
- for (User *U : CI->users()) {
- if (!U->hasOneUse())
- continue;
-
- int64_t Offset = 0;
- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
- auto *BCI = dyn_cast<BitCastInst>(U);
- if (!Load && !BCI) {
- if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
- continue;
- Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
- BCI = dyn_cast<BitCastInst>(*U->user_begin());
- }
-
- if (BCI) {
- if (!BCI->hasOneUse())
- continue;
- Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
- }
-
- if (!Load || !Load->isSimple())
- continue;
-
- unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
-
- // TODO: Handle merged loads.
- if (IsV5OrAbove) { // Base is ImplicitArgPtr.
- switch (Offset) {
- case HIDDEN_BLOCK_COUNT_X:
- if (LoadSize == 4) {
- BlockCounts[0] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
- }
- break;
- case HIDDEN_BLOCK_COUNT_Y:
- if (LoadSize == 4) {
- BlockCounts[1] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
- }
- break;
- case HIDDEN_BLOCK_COUNT_Z:
- if (LoadSize == 4) {
- BlockCounts[2] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
- }
- break;
- case HIDDEN_GROUP_SIZE_X:
- if (LoadSize == 2)
- GroupSizes[0] = Load;
- break;
- case HIDDEN_GROUP_SIZE_Y:
- if (LoadSize == 2)
- GroupSizes[1] = Load;
- break;
- case HIDDEN_GROUP_SIZE_Z:
- if (LoadSize == 2)
- GroupSizes[2] = Load;
- break;
- case HIDDEN_REMAINDER_X:
- if (LoadSize == 2)
- Remainders[0] = Load;
- break;
- case HIDDEN_REMAINDER_Y:
- if (LoadSize == 2)
- Remainders[1] = Load;
- break;
- case HIDDEN_REMAINDER_Z:
- if (LoadSize == 2)
- Remainders[2] = Load;
- break;
- default:
- break;
- }
- } else { // Base is DispatchPtr.
- switch (Offset) {
- case WORKGROUP_SIZE_X:
- if (LoadSize == 2)
- GroupSizes[0] = Load;
- break;
- case WORKGROUP_SIZE_Y:
- if (LoadSize == 2)
- GroupSizes[1] = Load;
- break;
- case WORKGROUP_SIZE_Z:
- if (LoadSize == 2)
- GroupSizes[2] = Load;
- break;
- case GRID_SIZE_X:
- if (LoadSize == 4)
- GridSizes[0] = Load;
- break;
- case GRID_SIZE_Y:
- if (LoadSize == 4)
- GridSizes[1] = Load;
- break;
- case GRID_SIZE_Z:
- if (LoadSize == 4)
- GridSizes[2] = Load;
- break;
- default:
- break;
- }
- }
- }
-
- bool MadeChange = false;
- if (IsV5OrAbove && HasUniformWorkGroupSize) {
- // Under v5 __ockl_get_local_size returns the value computed by the
- // expression:
- //
- // workgroup_id < hidden_block_count ? hidden_group_size :
- // hidden_remainder
- //
- // For functions with the attribute uniform-work-group-size=true. we can
- // evaluate workgroup_id < hidden_block_count as true, and thus
- // hidden_group_size is returned for __ockl_get_local_size.
- for (int I = 0; I < 3; ++I) {
- Value *BlockCount = BlockCounts[I];
- if (!BlockCount)
- continue;
-
- using namespace llvm::PatternMatch;
- auto GroupIDIntrin =
- I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
- : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
- : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
- for (User *ICmp : BlockCount->users()) {
- if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
- m_Specific(BlockCount)))) {
- ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
- MadeChange = true;
- }
- }
- }
-
- // All remainders should be 0 with uniform work group size.
- for (Value *Remainder : Remainders) {
- if (!Remainder)
- continue;
- Remainder->replaceAllUsesWith(
- Constant::getNullValue(Remainder->getType()));
- MadeChange = true;
- }
- } else if (HasUniformWorkGroupSize) { // Pre-V5.
- // Pattern match the code used to handle partial workgroup dispatches in the
- // library implementation of get_local_size, so the entire function can be
- // constant folded with a known group size.
- //
- // uint r = grid_size - group_id * group_size;
- // get_local_size = (r < group_size) ? r : group_size;
- //
- // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
- // the grid_size is required to be a multiple of group_size). In this case:
- //
- // grid_size - (group_id * group_size) < group_size
- // ->
- // grid_size < group_size + (group_id * group_size)
- //
- // (grid_size / group_size) < 1 + group_id
- //
- // grid_size / group_size is at least 1, so we can conclude the select
- // condition is false (except for group_id == 0, where the select result is
- // the same).
- for (int I = 0; I < 3; ++I) {
- Value *GroupSize = GroupSizes[I];
- Value *GridSize = GridSizes[I];
- if (!GroupSize || !GridSize)
- continue;
-
- using namespace llvm::PatternMatch;
- auto GroupIDIntrin =
- I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
- : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
- : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
- for (User *U : GroupSize->users()) {
- auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
- if (!ZextGroupSize)
- continue;
-
- for (User *UMin : ZextGroupSize->users()) {
- if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
- m_Mul(GroupIDIntrin,
- m_Specific(ZextGroupSize))),
- m_Specific(ZextGroupSize)))) {
- if (HasReqdWorkGroupSize) {
- ConstantInt *KnownSize =
- mdconst::extract<ConstantInt>(MD->getOperand(I));
- UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
- KnownSize, UMin->getType(), false, DL));
- } else {
- UMin->replaceAllUsesWith(ZextGroupSize);
- }
-
- MadeChange = true;
- }
- }
- }
- }
- }
-
- // Upgrade the old method of calculating the block size using the grid size.
- // We pattern match any case where the implicit argument group size is the
- // divisor to a dispatch packet grid size read of the same dimension.
- if (IsV5OrAbove) {
- for (int I = 0; I < 3; I++) {
- Value *GroupSize = GroupSizes[I];
- if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
- continue;
-
- for (User *U : GroupSize->users()) {
- Instruction *Inst = cast<Instruction>(U);
- if (isa<ZExtInst>(Inst) && !Inst->use_empty())
- Inst = cast<Instruction>(*Inst->user_begin());
-
- using namespace llvm::PatternMatch;
- if (!match(
- Inst,
- m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
- m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
- m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
- m_Value())))
- continue;
-
- IRBuilder<> Builder(Inst);
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Builder.getInt8Ty(), CI,
- {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
- HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
- Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
- BlockCount->setMetadata(LLVMContext::MD_invariant_load,
- MDNode::get(CI->getContext(), {}));
- BlockCount->setMetadata(LLVMContext::MD_noundef,
- MDNode::get(CI->getContext(), {}));
-
- Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
- Inst->replaceAllUsesWith(BlockCountExt);
- Inst->eraseFromParent();
- MadeChange = true;
- }
- }
- }
-
- // If reqd_work_group_size is set, we can replace work group size with it.
- if (!HasReqdWorkGroupSize)
- return MadeChange;
-
- for (int I = 0; I < 3; I++) {
- Value *GroupSize = GroupSizes[I];
- if (!GroupSize)
- continue;
-
- ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
- GroupSize->replaceAllUsesWith(
- ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
-// TargetPassConfig for subtarget.
-bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
- bool MadeChange = false;
- bool IsV5OrAbove =
- AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
- Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
-
- if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
- return false;
-
- SmallPtrSet<Instruction *, 4> HandledUses;
- for (auto *U : BasePtr->users()) {
- CallInst *CI = cast<CallInst>(U);
- if (HandledUses.insert(CI).second) {
- if (processUse(CI, IsV5OrAbove))
- MadeChange = true;
- }
- }
-
- return MadeChange;
-}
-
-INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU Kernel Attributes", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU Kernel Attributes", false, false)
-
-char AMDGPULowerKernelAttributes::ID = 0;
-
-ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
- return new AMDGPULowerKernelAttributes();
-}
-
-PreservedAnalyses
-AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool IsV5OrAbove =
- AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
- Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
-
- if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
- return PreservedAnalyses::all();
-
- bool Changed = false;
- for (Instruction &I : instructions(F)) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (CI->getCalledFunction() == BasePtr)
- Changed |= processUse(CI, IsV5OrAbove);
- }
- }
-
- return !Changed ? PreservedAnalyses::all()
- : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index f464fbf31c754..40d12e6c10b80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -62,8 +62,6 @@ FUNCTION_PASS("amdgpu-late-codegenprepare",
*static_cast<const GCNTargetMachine *>(this)))
FUNCTION_PASS("amdgpu-lower-kernel-arguments",
AMDGPULowerKernelArgumentsPass(*this))
-FUNCTION_PASS("amdgpu-lower-kernel-attributes",
- AMDGPULowerKernelAttributesPass())
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d25b22b2b96dc..86b6e8b878ba1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -582,7 +582,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
- initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -874,8 +873,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineEarlySimplificationEPCallback(
- [](ModulePassManager &PM, OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
+ [this](ModulePassManager &PM, OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
@@ -898,6 +897,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
+
+ if (!isLTOPreLink(Phase))
+ if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
+ AMDGPUAttributorOptions Opts;
+ PM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+ }
});
PB.registerPeepholeEPCallback(
@@ -931,10 +936,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass());
- // This should run after inlining to have any chance of doing
- // anything, and before other cleanup optimizations.
- FPM.addPass(AMDGPULowerKernelAttributesPass());
-
if (Level != OptimizationLevel::O0) {
// Promote alloca to vector before SROA and loop unroll. If we
// manage to eliminate allocas before unroll we may choose to unroll
@@ -945,20 +946,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});
- // FIXME: Why is AMDGPUAttributor not in CGSCC?
- PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
- OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
- if (Level != OptimizationLevel::O0) {
- if (!isLTOPreLink(Phase)) {
- if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
- AMDGPUAttributorOptions Opts;
- MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
- }
- }
- }
- });
-
PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
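A note on the pipeline position: the attributor previously ran from registerOptimizerLastEPCallback, i.e. at the very end of the module pipeline, where the folds inherited from AMDGPULowerKernelAttributes would come too late to feed later cleanup. Hoisting it into the early-simplification extension point (still skipped in LTO pre-link phases) makes both the inferred amdgpu-no-* attributes and the kernel-attribute folds visible to the rest of the middle end; this is presumably why the updated RUN lines below pair the pass with instcombine, e.g. opt -passes=amdgpu-attributor,instcombine.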
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 782cbfa76e6e9..d85852beb803f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,7 +75,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerBufferFatPointers.cpp
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
- AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
AMDGPULowerExecSync.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
index 9064292129928..d8b80626f1974 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
define i32 @use_grid_size_x_max_num_workgroups() #0 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
@@ -111,10 +111,10 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
!0 = !{i32 0, i32 -1}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[RNG0]] = !{i32 1, i32 37}
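For reference on the [[RNG0]] check: MDBuilder::createRange produces a half-open interval, so with "amdgpu-max-num-workgroups"="36,42,89" the x block-count load is annotated with values in [1, 37), i.e. 1 through 36 inclusive. A one-line sketch (names illustrative):

  %count = load i32, ptr addrspace(4) %p, align 4, !range !0
  !0 = !{i32 1, i32 37}  ; half-open: 1 <= %count <= 36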
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
index 25e43a0f332c6..914658031f12e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine %s | FileCheck %s
define i32 @num_blocks_x() {
-; CHECK-LABEL: define i32 @num_blocks_x() {
+; CHECK-LABEL: define i32 @num_blocks_x(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]]
@@ -21,7 +22,8 @@ entry:
}
define i32 @num_blocks_y() {
-; CHECK-LABEL: define i32 @num_blocks_y() {
+; CHECK-LABEL: define i32 @num_blocks_y(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 4
@@ -41,7 +43,8 @@ entry:
}
define i32 @num_blocks_z() {
-; CHECK-LABEL: define i32 @num_blocks_z() {
+; CHECK-LABEL: define i32 @num_blocks_z(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 8
@@ -62,7 +65,7 @@ entry:
define i32 @num_blocks(i32 %dim) {
; CHECK-LABEL: define i32 @num_blocks(
-; CHECK-SAME: i32 [[DIM:%.*]]) {
+; CHECK-SAME: i32 [[DIM:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: switch i32 [[DIM]], label %[[DEFAULT:.*]] [
@@ -131,7 +134,8 @@ exit:
}
define i64 @larger() {
-; CHECK-LABEL: define i64 @larger() {
+; CHECK-LABEL: define i64 @larger(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
@@ -152,7 +156,8 @@ entry:
}
define i32 @bad_offset() {
-; CHECK-LABEL: define i32 @bad_offset() {
+; CHECK-LABEL: define i32 @bad_offset(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
@@ -177,7 +182,8 @@ entry:
}
define i32 @dangling() {
-; CHECK-LABEL: define i32 @dangling() {
+; CHECK-LABEL: define i32 @dangling(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -196,7 +202,8 @@ entry:
}
define i32 @wrong_cast() {
-; CHECK-LABEL: define i32 @wrong_cast() {
+; CHECK-LABEL: define i32 @wrong_cast(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -221,7 +228,8 @@ entry:
}
define i32 @wrong_size() {
-; CHECK-LABEL: define i32 @wrong_size() {
+; CHECK-LABEL: define i32 @wrong_size(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -246,7 +254,8 @@ entry:
}
define i32 @wrong_intrinsic() {
-; CHECK-LABEL: define i32 @wrong_intrinsic() {
+; CHECK-LABEL: define i32 @wrong_intrinsic(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
@@ -271,7 +280,8 @@ entry:
}
define i16 @empty_use() {
-; CHECK-LABEL: define i16 @empty_use() {
+; CHECK-LABEL: define i16 @empty_use(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -296,7 +306,8 @@ entry:
}
define i32 @multiple_use() {
-; CHECK-LABEL: define i32 @multiple_use() {
+; CHECK-LABEL: define i32 @multiple_use(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index 3563e737f5520..1fa939977fc7e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index 8c4bd4e882ac6..aa8feb59bbbda 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
; CHECK-LABEL: @invalid_reqd_work_group_size(
; CHECK: load i16,
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 6a88be6e55859..d655306dee19d 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -826,5 +826,5 @@ entry:
; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
-; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "uniform-work-group-size"="false" }
+; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) }
attributes #0 = { nounwind }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index d078403135963..d5cc5be2b6aa9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -165,7 +165,6 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPULowerExecSync.cpp",
"AMDGPULowerIntrinsics.cpp",
"AMDGPULowerKernelArguments.cpp",
- "AMDGPULowerKernelAttributes.cpp",
"AMDGPULowerModuleLDSPass.cpp",
"AMDGPULowerVGPREncoding.cpp",
"AMDGPUMCInstLower.cpp",
From c61763845288ad74246466f167dff32154057c75 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi at amd.com>
Date: Thu, 22 Jan 2026 17:46:00 -0600
Subject: [PATCH 2/2] Add missed change on a test
---
.../amdgcnspirv-uses-amdgpu-abi.cpp | 44 +++++++++++--------
1 file changed, 26 insertions(+), 18 deletions(-)
diff --git a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
index 8f92d1fed1f9f..b6645409722aa 100644
--- a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
+++ b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
@@ -81,7 +81,7 @@ __global__ void k4(SingleElement) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local amdgpu_kernel void @_Z2k55ByRef(
-// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -117,7 +117,7 @@ __global__ void k7(unsigned*) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f0s(
-// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -129,7 +129,7 @@ __device__ void f0(short) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f1j(
-// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -141,7 +141,7 @@ __device__ void f1(unsigned) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f2d(
-// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -153,7 +153,7 @@ __device__ void f2(double) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f311Transparent(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -165,7 +165,7 @@ __device__ void f3(Transparent) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f413SingleElement(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -177,7 +177,7 @@ __device__ void f4(SingleElement) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f55ByRef(
-// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -189,7 +189,7 @@ __device__ void f5(ByRef) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f6Dv1_jDv2_jDv3_jDv4_j(
-// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -201,7 +201,7 @@ __device__ void f6(V1, V2, V3, V4) { }
// AMDGCNSPIRV-NEXT: ret i16 0
//
// AMDGPU-LABEL: define dso_local noundef signext i16 @_Z2f7v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i16 0
//
@@ -213,7 +213,7 @@ __device__ short f7() { return 0; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z2f8v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -225,7 +225,7 @@ __device__ unsigned f8() { return 0; }
// AMDGCNSPIRV-NEXT: ret double 0.000000e+00
//
// AMDGPU-LABEL: define dso_local noundef double @_Z2f9v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret double 0.000000e+00
//
@@ -237,7 +237,7 @@ __device__ double f9() { return 0.; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z3f10v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -249,7 +249,7 @@ __device__ Transparent f10() { return {}; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z3f11v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -262,7 +262,7 @@ __device__ SingleElement f11() { return {}; }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z3f12v(
-// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 4 dereferenceable(68) [[AGG_RESULT]], i8 0, i64 68, i1 false)
// AMDGPU-NEXT: ret void
@@ -275,7 +275,7 @@ __device__ ByRef f12() { return {}; }
// AMDGCNSPIRV-NEXT: ret <1 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <1 x i32> @_Z3f13v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <1 x i32> zeroinitializer
//
@@ -287,7 +287,7 @@ __device__ V1 f13() { return {}; }
// AMDGCNSPIRV-NEXT: ret <2 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <2 x i32> @_Z3f14v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <2 x i32> zeroinitializer
//
@@ -299,7 +299,7 @@ __device__ V2 f14() { return {}; }
// AMDGCNSPIRV-NEXT: ret <3 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <3 x i32> @_Z3f15v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <3 x i32> zeroinitializer
//
@@ -311,7 +311,7 @@ __device__ V3 f15() { return {}; }
// AMDGCNSPIRV-NEXT: ret <4 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <4 x i32> @_Z3f16v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <4 x i32> zeroinitializer
//
@@ -319,3 +319,11 @@ __device__ V4 f16() { return {}; }
//.
// AMDGCNSPIRV: [[META9]] = !{i32 1024, i32 1, i32 1}
//.
+
+// For reference, the AMDGPU attribute groups behind the #[[ATTRn]] checks above:
+// attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #5 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }