[clang] [llvm] [AMDGPU] Move AMDGPUAttributor earlier with lowering kernel attributes (PR #177432)
Yoonseo Choi via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 22 15:46:21 PST 2026
https://github.com/yoonseoch updated https://github.com/llvm/llvm-project/pull/177432
From a7b28cf2424e788a3416ba7b411241fc4387d7d0 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi at amd.com>
Date: Thu, 22 Jan 2026 12:48:23 -0600
Subject: [PATCH 1/2] [AMDGPU] Move AMDGPUAttributor earlier with lowering
kernel attributes
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 9 -
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 362 +++++++++++++-
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 443 ------------------
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 -
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 29 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 -
...amdgpu-max-num-workgroups-load-annotate.ll | 10 +-
.../AMDGPU/implicit-arg-block-count.ll | 37 +-
.../CodeGen/AMDGPU/implicit-arg-v5-opt.ll | 2 +-
.../CodeGen/AMDGPU/reqd-work-group-size.ll | 4 +-
llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll | 2 +-
.../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 -
12 files changed, 402 insertions(+), 500 deletions(-)
delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 5df11a45b4889..de76dd6ab3bb5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -123,15 +123,6 @@ struct AMDGPUPromoteKernelArgumentsPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-ModulePass *createAMDGPULowerKernelAttributesPass();
-void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
-extern char &AMDGPULowerKernelAttributesID;
-
-struct AMDGPULowerKernelAttributesPass
- : PassInfoMixin<AMDGPULowerKernelAttributesPass> {
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-};
-
void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSLegacyPassID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..1f4229a2b15a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -13,8 +13,14 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"
@@ -50,6 +56,343 @@ static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
#include "AMDGPUAttributes.def"
};
+// Field offsets in hsa_kernel_dispatch_packet_t.
+enum DispatchPackedOffsets {
+ WORKGROUP_SIZE_X = 4,
+ WORKGROUP_SIZE_Y = 6,
+ WORKGROUP_SIZE_Z = 8,
+
+ GRID_SIZE_X = 12,
+ GRID_SIZE_Y = 16,
+ GRID_SIZE_Z = 20
+};
+
+// Field offsets to implicit kernel argument pointer.
+enum ImplicitArgOffsets {
+ HIDDEN_BLOCK_COUNT_X = 0,
+ HIDDEN_BLOCK_COUNT_Y = 4,
+ HIDDEN_BLOCK_COUNT_Z = 8,
+
+ HIDDEN_GROUP_SIZE_X = 12,
+ HIDDEN_GROUP_SIZE_Y = 14,
+ HIDDEN_GROUP_SIZE_Z = 16,
+
+ HIDDEN_REMAINDER_X = 18,
+ HIDDEN_REMAINDER_Y = 20,
+ HIDDEN_REMAINDER_Z = 22,
+};
+
+static Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
+ auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
+ : Intrinsic::amdgcn_dispatch_ptr;
+ return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
+}
+
+static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
+ uint32_t MaxNumGroups) {
+ if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
+ return;
+
+ if (!Load->getType()->isIntegerTy(32))
+ return;
+
+ // TODO: If there is existing range metadata, preserve it if it is stricter.
+ MDBuilder MDB(Load->getContext());
+ MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
+ Load->setMetadata(LLVMContext::MD_range, Range);
+}
+
+static bool processUse(CallInst *CI, bool IsV5OrAbove) {
+ Function *F = CI->getFunction();
+
+ auto *MD = F->getMetadata("reqd_work_group_size");
+ const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
+
+ const bool HasUniformWorkGroupSize =
+ F->getFnAttribute("uniform-work-group-size").getValueAsBool();
+
+ SmallVector<unsigned> MaxNumWorkgroups =
+ AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
+ /*Size=*/3, /*DefaultVal=*/0);
+
+ if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
+ !Intrinsic::getDeclarationIfExists(CI->getModule(),
+ Intrinsic::amdgcn_dispatch_ptr) &&
+ none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
+ return false;
+
+ Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
+ Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
+ Value *Remainders[3] = {nullptr, nullptr, nullptr};
+ Value *GridSizes[3] = {nullptr, nullptr, nullptr};
+
+ const DataLayout &DL = F->getDataLayout();
+
+ // We expect to see several GEP users, cast to the appropriate type and
+ // loaded.
+ for (User *U : CI->users()) {
+ if (!U->hasOneUse())
+ continue;
+
+ int64_t Offset = 0;
+ auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
+ auto *BCI = dyn_cast<BitCastInst>(U);
+ if (!Load && !BCI) {
+ if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
+ continue;
+ Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
+ BCI = dyn_cast<BitCastInst>(*U->user_begin());
+ }
+
+ if (BCI) {
+ if (!BCI->hasOneUse())
+ continue;
+ Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
+ }
+
+ if (!Load || !Load->isSimple())
+ continue;
+
+ unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
+
+ // TODO: Handle merged loads.
+ if (IsV5OrAbove) { // Base is ImplicitArgPtr.
+ switch (Offset) {
+ case HIDDEN_BLOCK_COUNT_X:
+ if (LoadSize == 4) {
+ BlockCounts[0] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
+ }
+ break;
+ case HIDDEN_BLOCK_COUNT_Y:
+ if (LoadSize == 4) {
+ BlockCounts[1] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
+ }
+ break;
+ case HIDDEN_BLOCK_COUNT_Z:
+ if (LoadSize == 4) {
+ BlockCounts[2] = Load;
+ annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
+ }
+ break;
+ case HIDDEN_GROUP_SIZE_X:
+ if (LoadSize == 2)
+ GroupSizes[0] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Y:
+ if (LoadSize == 2)
+ GroupSizes[1] = Load;
+ break;
+ case HIDDEN_GROUP_SIZE_Z:
+ if (LoadSize == 2)
+ GroupSizes[2] = Load;
+ break;
+ case HIDDEN_REMAINDER_X:
+ if (LoadSize == 2)
+ Remainders[0] = Load;
+ break;
+ case HIDDEN_REMAINDER_Y:
+ if (LoadSize == 2)
+ Remainders[1] = Load;
+ break;
+ case HIDDEN_REMAINDER_Z:
+ if (LoadSize == 2)
+ Remainders[2] = Load;
+ break;
+ default:
+ break;
+ }
+ } else { // Base is DispatchPtr.
+ switch (Offset) {
+ case WORKGROUP_SIZE_X:
+ if (LoadSize == 2)
+ GroupSizes[0] = Load;
+ break;
+ case WORKGROUP_SIZE_Y:
+ if (LoadSize == 2)
+ GroupSizes[1] = Load;
+ break;
+ case WORKGROUP_SIZE_Z:
+ if (LoadSize == 2)
+ GroupSizes[2] = Load;
+ break;
+ case GRID_SIZE_X:
+ if (LoadSize == 4)
+ GridSizes[0] = Load;
+ break;
+ case GRID_SIZE_Y:
+ if (LoadSize == 4)
+ GridSizes[1] = Load;
+ break;
+ case GRID_SIZE_Z:
+ if (LoadSize == 4)
+ GridSizes[2] = Load;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ bool MadeChange = false;
+ if (IsV5OrAbove && HasUniformWorkGroupSize) {
+ // Under v5 __ockl_get_local_size returns the value computed by the
+ // expression:
+ //
+ // workgroup_id < hidden_block_count ? hidden_group_size :
+ // hidden_remainder
+ //
+ // For functions with the attribute uniform-work-group-size=true, we can
+ // evaluate workgroup_id < hidden_block_count as true, and thus
+ // hidden_group_size is returned for __ockl_get_local_size.
+ for (int I = 0; I < 3; ++I) {
+ Value *BlockCount = BlockCounts[I];
+ if (!BlockCount)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin =
+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ for (User *ICmp : BlockCount->users()) {
+ if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
+ m_Specific(BlockCount)))) {
+ ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
+ MadeChange = true;
+ }
+ }
+ }
+
+ // All remainders should be 0 with uniform work group size.
+ for (Value *Remainder : Remainders) {
+ if (!Remainder)
+ continue;
+ Remainder->replaceAllUsesWith(
+ Constant::getNullValue(Remainder->getType()));
+ MadeChange = true;
+ }
+ } else if (HasUniformWorkGroupSize) { // Pre-V5.
+ // Pattern match the code used to handle partial workgroup dispatches in the
+ // library implementation of get_local_size, so the entire function can be
+ // constant folded with a known group size.
+ //
+ // uint r = grid_size - group_id * group_size;
+ // get_local_size = (r < group_size) ? r : group_size;
+ //
+ // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
+ // the grid_size is required to be a multiple of group_size. In this case:
+ //
+ // grid_size - (group_id * group_size) < group_size
+ // ->
+ // grid_size < group_size + (group_id * group_size)
+ //
+ // (grid_size / group_size) < 1 + group_id
+ //
+ // grid_size / group_size is at least 1, so we can conclude the select
+ // condition is false (except for group_id == 0, where the select result is
+ // the same).
+ for (int I = 0; I < 3; ++I) {
+ Value *GroupSize = GroupSizes[I];
+ Value *GridSize = GridSizes[I];
+ if (!GroupSize || !GridSize)
+ continue;
+
+ using namespace llvm::PatternMatch;
+ auto GroupIDIntrin =
+ I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
+ : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
+ : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
+
+ for (User *U : GroupSize->users()) {
+ auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
+ if (!ZextGroupSize)
+ continue;
+
+ for (User *UMin : ZextGroupSize->users()) {
+ if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
+ m_Mul(GroupIDIntrin,
+ m_Specific(ZextGroupSize))),
+ m_Specific(ZextGroupSize)))) {
+ if (HasReqdWorkGroupSize) {
+ ConstantInt *KnownSize =
+ mdconst::extract<ConstantInt>(MD->getOperand(I));
+ UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
+ KnownSize, UMin->getType(), false, DL));
+ } else {
+ UMin->replaceAllUsesWith(ZextGroupSize);
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+ }
+ }
+
+ // Upgrade the old method of calculating the block size using the grid size.
+ // We pattern match any case where the implicit argument group size is the
+ // divisor to a dispatch packet grid size read of the same dimension.
+ if (IsV5OrAbove) {
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
+ continue;
+
+ for (User *U : GroupSize->users()) {
+ Instruction *Inst = cast<Instruction>(U);
+ if (isa<ZExtInst>(Inst) && !Inst->use_empty())
+ Inst = cast<Instruction>(*Inst->user_begin());
+
+ using namespace llvm::PatternMatch;
+ if (!match(
+ Inst,
+ m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
+ m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
+ m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
+ m_Value())))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Builder.getInt8Ty(), CI,
+ {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
+ HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
+ Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
+ BlockCount->setMetadata(LLVMContext::MD_invariant_load,
+ MDNode::get(CI->getContext(), {}));
+ BlockCount->setMetadata(LLVMContext::MD_noundef,
+ MDNode::get(CI->getContext(), {}));
+
+ Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
+ Inst->replaceAllUsesWith(BlockCountExt);
+ Inst->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+ }
+
+ // If reqd_work_group_size is set, we can replace work group size with it.
+ if (!HasReqdWorkGroupSize)
+ return MadeChange;
+
+ for (int I = 0; I < 3; I++) {
+ Value *GroupSize = GroupSizes[I];
+ if (!GroupSize)
+ continue;
+
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
+ GroupSize->replaceAllUsesWith(
+ ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
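For context, a minimal IR sketch of the v5 shape that processUse matches and folds under uniform-work-group-size (value names here are illustrative, not taken from the patch):

  ; Sketch of an __ockl_get_local_size-style sequence under code object v5.
  declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  declare i32 @llvm.amdgcn.workgroup.id.x()

  define i16 @get_local_size_x_sketch() #0 {
    %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    ; HIDDEN_BLOCK_COUNT_X is at offset 0, HIDDEN_GROUP_SIZE_X at 12,
    ; HIDDEN_REMAINDER_X at 18.
    %block.count = load i32, ptr addrspace(4) %implicitarg, align 4
    %gs.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12
    %group.size = load i16, ptr addrspace(4) %gs.gep, align 2
    %rem.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 18
    %remainder = load i16, ptr addrspace(4) %rem.gep, align 2
    %gid = call i32 @llvm.amdgcn.workgroup.id.x()
    %in.full.block = icmp ult i32 %gid, %block.count
    %size = select i1 %in.full.block, i16 %group.size, i16 %remainder
    ret i16 %size
  }

  attributes #0 = { "uniform-work-group-size"="true" }

With uniform-work-group-size=true the icmp folds to true and the remainder load folds to 0, so the select collapses to %group.size; if reqd_work_group_size metadata is also present, the group-size load itself is folded to the known constant.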
@@ -1660,7 +2003,24 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
}
}
- return A.run() == ChangeStatus::CHANGED;
+ bool Changed = A.run() == ChangeStatus::CHANGED;
+
+ // Kernel attribute lowering (merged from AMDGPULowerKernelAttributesPass)
+ bool IsV5OrAbove =
+ AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
+ Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
+ if (BasePtr) {
+ for (auto *F : Functions) {
+ for (Instruction &I : instructions(*F)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() == BasePtr)
+ Changed |= processUse(CI, IsV5OrAbove);
+ }
+ }
+ }
+ }
+
+ return Changed;
}
} // namespace
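Similarly, a before/after sketch of the block-count upgrade in the v5 path (again illustrative, not from the patch):

  ; The legacy computation: dispatch-packet grid size (GRID_SIZE_X, offset 12)
  ; divided by the implicit-arg group size (HIDDEN_GROUP_SIZE_X, offset 12).
  declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
  declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()

  define i32 @num_blocks_x_sketch() {
    %implicitarg = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
    %dispatch = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
    %gs.gep = getelementptr i8, ptr addrspace(4) %implicitarg, i64 12
    %group.size = load i16, ptr addrspace(4) %gs.gep, align 2
    %group.size.ext = zext i16 %group.size to i32
    %grid.gep = getelementptr i8, ptr addrspace(4) %dispatch, i64 12
    %grid.size = load i32, ptr addrspace(4) %grid.gep, align 4
    %num.blocks = udiv i32 %grid.size, %group.size.ext
    ret i32 %num.blocks
  }

  ; After the upgrade (and once instcombine drops the zero-offset GEP, as in
  ; the implicit-arg-block-count.ll checks), the udiv becomes a direct load of
  ; HIDDEN_BLOCK_COUNT_X from the implicit argument pointer:
  ;   %num.blocks = load i32, ptr addrspace(4) %implicitarg, align 4,
  ;                           !invariant.load !0, !noundef !0   ; !0 = !{}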
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
deleted file mode 100644
index fbfb71059b6b1..0000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ /dev/null
@@ -1,443 +0,0 @@
-//===-- AMDGPULowerKernelAttributes.cpp------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file This pass does attempts to make use of reqd_work_group_size metadata
-/// to eliminate loads from the dispatch packet and to constant fold OpenCL
-/// get_local_size-like functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Pass.h"
-
-#define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
-
-using namespace llvm;
-
-namespace {
-
-// Field offsets in hsa_kernel_dispatch_packet_t.
-enum DispatchPackedOffsets {
- WORKGROUP_SIZE_X = 4,
- WORKGROUP_SIZE_Y = 6,
- WORKGROUP_SIZE_Z = 8,
-
- GRID_SIZE_X = 12,
- GRID_SIZE_Y = 16,
- GRID_SIZE_Z = 20
-};
-
-// Field offsets to implicit kernel argument pointer.
-enum ImplicitArgOffsets {
- HIDDEN_BLOCK_COUNT_X = 0,
- HIDDEN_BLOCK_COUNT_Y = 4,
- HIDDEN_BLOCK_COUNT_Z = 8,
-
- HIDDEN_GROUP_SIZE_X = 12,
- HIDDEN_GROUP_SIZE_Y = 14,
- HIDDEN_GROUP_SIZE_Z = 16,
-
- HIDDEN_REMAINDER_X = 18,
- HIDDEN_REMAINDER_Y = 20,
- HIDDEN_REMAINDER_Z = 22,
-};
-
-class AMDGPULowerKernelAttributes : public ModulePass {
-public:
- static char ID;
-
- AMDGPULowerKernelAttributes() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override;
-
- StringRef getPassName() const override { return "AMDGPU Kernel Attributes"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-};
-
-Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) {
- auto IntrinsicId = IsV5OrAbove ? Intrinsic::amdgcn_implicitarg_ptr
- : Intrinsic::amdgcn_dispatch_ptr;
- return Intrinsic::getDeclarationIfExists(&M, IntrinsicId);
-}
-
-} // end anonymous namespace
-
-static void annotateGridSizeLoadWithRangeMD(LoadInst *Load,
- uint32_t MaxNumGroups) {
- if (MaxNumGroups == 0 || MaxNumGroups == std::numeric_limits<uint32_t>::max())
- return;
-
- if (!Load->getType()->isIntegerTy(32))
- return;
-
- // TODO: If there is existing range metadata, preserve it if it is stricter.
- MDBuilder MDB(Load->getContext());
- MDNode *Range = MDB.createRange(APInt(32, 1), APInt(32, MaxNumGroups + 1));
- Load->setMetadata(LLVMContext::MD_range, Range);
-}
-
-static bool processUse(CallInst *CI, bool IsV5OrAbove) {
- Function *F = CI->getFunction();
-
- auto *MD = F->getMetadata("reqd_work_group_size");
- const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
-
- const bool HasUniformWorkGroupSize =
- F->getFnAttribute("uniform-work-group-size").getValueAsBool();
-
- SmallVector<unsigned> MaxNumWorkgroups =
- AMDGPU::getIntegerVecAttribute(*F, "amdgpu-max-num-workgroups",
- /*Size=*/3, /*DefaultVal=*/0);
-
- if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize &&
- !Intrinsic::getDeclarationIfExists(CI->getModule(),
- Intrinsic::amdgcn_dispatch_ptr) &&
- none_of(MaxNumWorkgroups, [](unsigned X) { return X != 0; }))
- return false;
-
- Value *BlockCounts[3] = {nullptr, nullptr, nullptr};
- Value *GroupSizes[3] = {nullptr, nullptr, nullptr};
- Value *Remainders[3] = {nullptr, nullptr, nullptr};
- Value *GridSizes[3] = {nullptr, nullptr, nullptr};
-
- const DataLayout &DL = F->getDataLayout();
-
- // We expect to see several GEP users, casted to the appropriate type and
- // loaded.
- for (User *U : CI->users()) {
- if (!U->hasOneUse())
- continue;
-
- int64_t Offset = 0;
- auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr/DispatchPtr?
- auto *BCI = dyn_cast<BitCastInst>(U);
- if (!Load && !BCI) {
- if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
- continue;
- Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
- BCI = dyn_cast<BitCastInst>(*U->user_begin());
- }
-
- if (BCI) {
- if (!BCI->hasOneUse())
- continue;
- Load = dyn_cast<LoadInst>(*BCI->user_begin()); // Load from BCI?
- }
-
- if (!Load || !Load->isSimple())
- continue;
-
- unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
-
- // TODO: Handle merged loads.
- if (IsV5OrAbove) { // Base is ImplicitArgPtr.
- switch (Offset) {
- case HIDDEN_BLOCK_COUNT_X:
- if (LoadSize == 4) {
- BlockCounts[0] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[0]);
- }
- break;
- case HIDDEN_BLOCK_COUNT_Y:
- if (LoadSize == 4) {
- BlockCounts[1] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[1]);
- }
- break;
- case HIDDEN_BLOCK_COUNT_Z:
- if (LoadSize == 4) {
- BlockCounts[2] = Load;
- annotateGridSizeLoadWithRangeMD(Load, MaxNumWorkgroups[2]);
- }
- break;
- case HIDDEN_GROUP_SIZE_X:
- if (LoadSize == 2)
- GroupSizes[0] = Load;
- break;
- case HIDDEN_GROUP_SIZE_Y:
- if (LoadSize == 2)
- GroupSizes[1] = Load;
- break;
- case HIDDEN_GROUP_SIZE_Z:
- if (LoadSize == 2)
- GroupSizes[2] = Load;
- break;
- case HIDDEN_REMAINDER_X:
- if (LoadSize == 2)
- Remainders[0] = Load;
- break;
- case HIDDEN_REMAINDER_Y:
- if (LoadSize == 2)
- Remainders[1] = Load;
- break;
- case HIDDEN_REMAINDER_Z:
- if (LoadSize == 2)
- Remainders[2] = Load;
- break;
- default:
- break;
- }
- } else { // Base is DispatchPtr.
- switch (Offset) {
- case WORKGROUP_SIZE_X:
- if (LoadSize == 2)
- GroupSizes[0] = Load;
- break;
- case WORKGROUP_SIZE_Y:
- if (LoadSize == 2)
- GroupSizes[1] = Load;
- break;
- case WORKGROUP_SIZE_Z:
- if (LoadSize == 2)
- GroupSizes[2] = Load;
- break;
- case GRID_SIZE_X:
- if (LoadSize == 4)
- GridSizes[0] = Load;
- break;
- case GRID_SIZE_Y:
- if (LoadSize == 4)
- GridSizes[1] = Load;
- break;
- case GRID_SIZE_Z:
- if (LoadSize == 4)
- GridSizes[2] = Load;
- break;
- default:
- break;
- }
- }
- }
-
- bool MadeChange = false;
- if (IsV5OrAbove && HasUniformWorkGroupSize) {
- // Under v5 __ockl_get_local_size returns the value computed by the
- // expression:
- //
- // workgroup_id < hidden_block_count ? hidden_group_size :
- // hidden_remainder
- //
- // For functions with the attribute uniform-work-group-size=true. we can
- // evaluate workgroup_id < hidden_block_count as true, and thus
- // hidden_group_size is returned for __ockl_get_local_size.
- for (int I = 0; I < 3; ++I) {
- Value *BlockCount = BlockCounts[I];
- if (!BlockCount)
- continue;
-
- using namespace llvm::PatternMatch;
- auto GroupIDIntrin =
- I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
- : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
- : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
- for (User *ICmp : BlockCount->users()) {
- if (match(ICmp, m_SpecificICmp(ICmpInst::ICMP_ULT, GroupIDIntrin,
- m_Specific(BlockCount)))) {
- ICmp->replaceAllUsesWith(llvm::ConstantInt::getTrue(ICmp->getType()));
- MadeChange = true;
- }
- }
- }
-
- // All remainders should be 0 with uniform work group size.
- for (Value *Remainder : Remainders) {
- if (!Remainder)
- continue;
- Remainder->replaceAllUsesWith(
- Constant::getNullValue(Remainder->getType()));
- MadeChange = true;
- }
- } else if (HasUniformWorkGroupSize) { // Pre-V5.
- // Pattern match the code used to handle partial workgroup dispatches in the
- // library implementation of get_local_size, so the entire function can be
- // constant folded with a known group size.
- //
- // uint r = grid_size - group_id * group_size;
- // get_local_size = (r < group_size) ? r : group_size;
- //
- // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
- // the grid_size is required to be a multiple of group_size). In this case:
- //
- // grid_size - (group_id * group_size) < group_size
- // ->
- // grid_size < group_size + (group_id * group_size)
- //
- // (grid_size / group_size) < 1 + group_id
- //
- // grid_size / group_size is at least 1, so we can conclude the select
- // condition is false (except for group_id == 0, where the select result is
- // the same).
- for (int I = 0; I < 3; ++I) {
- Value *GroupSize = GroupSizes[I];
- Value *GridSize = GridSizes[I];
- if (!GroupSize || !GridSize)
- continue;
-
- using namespace llvm::PatternMatch;
- auto GroupIDIntrin =
- I == 0 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>()
- : (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>()
- : m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
-
- for (User *U : GroupSize->users()) {
- auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
- if (!ZextGroupSize)
- continue;
-
- for (User *UMin : ZextGroupSize->users()) {
- if (match(UMin, m_UMin(m_Sub(m_Specific(GridSize),
- m_Mul(GroupIDIntrin,
- m_Specific(ZextGroupSize))),
- m_Specific(ZextGroupSize)))) {
- if (HasReqdWorkGroupSize) {
- ConstantInt *KnownSize =
- mdconst::extract<ConstantInt>(MD->getOperand(I));
- UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
- KnownSize, UMin->getType(), false, DL));
- } else {
- UMin->replaceAllUsesWith(ZextGroupSize);
- }
-
- MadeChange = true;
- }
- }
- }
- }
- }
-
- // Upgrade the old method of calculating the block size using the grid size.
- // We pattern match any case where the implicit argument group size is the
- // divisor to a dispatch packet grid size read of the same dimension.
- if (IsV5OrAbove) {
- for (int I = 0; I < 3; I++) {
- Value *GroupSize = GroupSizes[I];
- if (!GroupSize || !GroupSize->getType()->isIntegerTy(16))
- continue;
-
- for (User *U : GroupSize->users()) {
- Instruction *Inst = cast<Instruction>(U);
- if (isa<ZExtInst>(Inst) && !Inst->use_empty())
- Inst = cast<Instruction>(*Inst->user_begin());
-
- using namespace llvm::PatternMatch;
- if (!match(
- Inst,
- m_UDiv(m_ZExtOrSelf(m_Load(m_GEP(
- m_Intrinsic<Intrinsic::amdgcn_dispatch_ptr>(),
- m_SpecificInt(GRID_SIZE_X + I * sizeof(uint32_t))))),
- m_Value())))
- continue;
-
- IRBuilder<> Builder(Inst);
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Builder.getInt8Ty(), CI,
- {ConstantInt::get(Type::getInt64Ty(CI->getContext()),
- HIDDEN_BLOCK_COUNT_X + I * sizeof(uint32_t))});
- Instruction *BlockCount = Builder.CreateLoad(Builder.getInt32Ty(), GEP);
- BlockCount->setMetadata(LLVMContext::MD_invariant_load,
- MDNode::get(CI->getContext(), {}));
- BlockCount->setMetadata(LLVMContext::MD_noundef,
- MDNode::get(CI->getContext(), {}));
-
- Value *BlockCountExt = Builder.CreateZExt(BlockCount, Inst->getType());
- Inst->replaceAllUsesWith(BlockCountExt);
- Inst->eraseFromParent();
- MadeChange = true;
- }
- }
- }
-
- // If reqd_work_group_size is set, we can replace work group size with it.
- if (!HasReqdWorkGroupSize)
- return MadeChange;
-
- for (int I = 0; I < 3; I++) {
- Value *GroupSize = GroupSizes[I];
- if (!GroupSize)
- continue;
-
- ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
- GroupSize->replaceAllUsesWith(
- ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-// TODO: Move makeLIDRangeMetadata usage into here. Seem to not get
-// TargetPassConfig for subtarget.
-bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
- bool MadeChange = false;
- bool IsV5OrAbove =
- AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
- Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
-
- if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
- return false;
-
- SmallPtrSet<Instruction *, 4> HandledUses;
- for (auto *U : BasePtr->users()) {
- CallInst *CI = cast<CallInst>(U);
- if (HandledUses.insert(CI).second) {
- if (processUse(CI, IsV5OrAbove))
- MadeChange = true;
- }
- }
-
- return MadeChange;
-}
-
-INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU Kernel Attributes", false, false)
-INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
- "AMDGPU Kernel Attributes", false, false)
-
-char AMDGPULowerKernelAttributes::ID = 0;
-
-ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
- return new AMDGPULowerKernelAttributes();
-}
-
-PreservedAnalyses
-AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool IsV5OrAbove =
- AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
- Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
-
- if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
- return PreservedAnalyses::all();
-
- bool Changed = false;
- for (Instruction &I : instructions(F)) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (CI->getCalledFunction() == BasePtr)
- Changed |= processUse(CI, IsV5OrAbove);
- }
- }
-
- return !Changed ? PreservedAnalyses::all()
- : PreservedAnalyses::none().preserveSet<CFGAnalyses>();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index f464fbf31c754..40d12e6c10b80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -62,8 +62,6 @@ FUNCTION_PASS("amdgpu-late-codegenprepare",
*static_cast<const GCNTargetMachine *>(this)))
FUNCTION_PASS("amdgpu-lower-kernel-arguments",
AMDGPULowerKernelArgumentsPass(*this))
-FUNCTION_PASS("amdgpu-lower-kernel-attributes",
- AMDGPULowerKernelAttributesPass())
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index d25b22b2b96dc..86b6e8b878ba1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -582,7 +582,6 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
- initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
@@ -874,8 +873,8 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineEarlySimplificationEPCallback(
- [](ModulePassManager &PM, OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
+ [this](ModulePassManager &PM, OptimizationLevel Level,
+ ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
@@ -898,6 +897,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
+
+ if (!isLTOPreLink(Phase))
+ if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
+ AMDGPUAttributorOptions Opts;
+ PM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
+ }
});
PB.registerPeepholeEPCallback(
@@ -931,10 +936,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
// but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass());
- // This should run after inlining to have any chance of doing
- // anything, and before other cleanup optimizations.
- FPM.addPass(AMDGPULowerKernelAttributesPass());
-
if (Level != OptimizationLevel::O0) {
// Promote alloca to vector before SROA and loop unroll. If we
// manage to eliminate allocas before unroll we may choose to unroll
@@ -945,20 +946,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});
- // FIXME: Why is AMDGPUAttributor not in CGSCC?
- PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
- OptimizationLevel Level,
- ThinOrFullLTOPhase Phase) {
- if (Level != OptimizationLevel::O0) {
- if (!isLTOPreLink(Phase)) {
- if (EnableAMDGPUAttributor && getTargetTriple().isAMDGCN()) {
- AMDGPUAttributorOptions Opts;
- MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
- }
- }
- }
- });
-
PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
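A note on the pipeline position: the attributor previously ran from registerOptimizerLastEPCallback, i.e. at the very end of the module pipeline, where the folds inherited from AMDGPULowerKernelAttributes would come too late to feed later cleanup. Hoisting it into the early-simplification extension point (still skipped in LTO pre-link phases) makes both the inferred amdgpu-no-* attributes and the kernel-attribute folds visible to the rest of the middle end; this is presumably why the updated RUN lines below pair the pass with instcombine, e.g. opt -passes=amdgpu-attributor,instcombine.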
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 782cbfa76e6e9..d85852beb803f 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -75,7 +75,6 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerBufferFatPointers.cpp
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
- AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
AMDGPUPrepareAGPRAlloc.cpp
AMDGPULowerExecSync.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
index 9064292129928..d8b80626f1974 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-max-num-workgroups-load-annotate.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-lower-kernel-attributes %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
define i32 @use_grid_size_x_max_num_workgroups() #0 {
; CHECK-LABEL: define i32 @use_grid_size_x_max_num_workgroups(
@@ -111,10 +111,10 @@ attributes #3 = { "amdgpu-max-num-workgroups"="0,42,89" }
!0 = !{i32 0, i32 -1}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-max-num-workgroups"="36,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-max-num-workgroups"="4294967294,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-max-num-workgroups"="4294967295,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-max-num-workgroups"="0,42,89" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; CHECK: [[RNG0]] = !{i32 1, i32 37}
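For reference on the [[RNG0]] check: MDBuilder::createRange produces a half-open interval, so with "amdgpu-max-num-workgroups"="36,42,89" the x block-count load is annotated with values in [1, 37), i.e. 1 through 36 inclusive. A one-line sketch (names illustrative):

  %count = load i32, ptr addrspace(4) %p, align 4, !range !0
  !0 = !{i32 1, i32 37}  ; half-open: 1 <= %count <= 36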
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
index 25e43a0f332c6..914658031f12e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-block-count.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine %s | FileCheck %s
define i32 @num_blocks_x() {
-; CHECK-LABEL: define i32 @num_blocks_x() {
+; CHECK-LABEL: define i32 @num_blocks_x(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0:![0-9]+]], !noundef [[META0]]
@@ -21,7 +22,8 @@ entry:
}
define i32 @num_blocks_y() {
-; CHECK-LABEL: define i32 @num_blocks_y() {
+; CHECK-LABEL: define i32 @num_blocks_y(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 4
@@ -41,7 +43,8 @@ entry:
}
define i32 @num_blocks_z() {
-; CHECK-LABEL: define i32 @num_blocks_z() {
+; CHECK-LABEL: define i32 @num_blocks_z(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG]], i64 8
@@ -62,7 +65,7 @@ entry:
define i32 @num_blocks(i32 %dim) {
; CHECK-LABEL: define i32 @num_blocks(
-; CHECK-SAME: i32 [[DIM:%.*]]) {
+; CHECK-SAME: i32 [[DIM:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: switch i32 [[DIM]], label %[[DEFAULT:.*]] [
@@ -131,7 +134,8 @@ exit:
}
define i64 @larger() {
-; CHECK-LABEL: define i64 @larger() {
+; CHECK-LABEL: define i64 @larger(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[GRID_SIZE_X:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
@@ -152,7 +156,8 @@ entry:
}
define i32 @bad_offset() {
-; CHECK-LABEL: define i32 @bad_offset() {
+; CHECK-LABEL: define i32 @bad_offset(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_Y:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
@@ -177,7 +182,8 @@ entry:
}
define i32 @dangling() {
-; CHECK-LABEL: define i32 @dangling() {
+; CHECK-LABEL: define i32 @dangling(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -196,7 +202,8 @@ entry:
}
define i32 @wrong_cast() {
-; CHECK-LABEL: define i32 @wrong_cast() {
+; CHECK-LABEL: define i32 @wrong_cast(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -221,7 +228,8 @@ entry:
}
define i32 @wrong_size() {
-; CHECK-LABEL: define i32 @wrong_size() {
+; CHECK-LABEL: define i32 @wrong_size(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -246,7 +254,8 @@ entry:
}
define i32 @wrong_intrinsic() {
-; CHECK-LABEL: define i32 @wrong_intrinsic() {
+; CHECK-LABEL: define i32 @wrong_intrinsic(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 16
@@ -271,7 +280,8 @@ entry:
}
define i16 @empty_use() {
-; CHECK-LABEL: define i16 @empty_use() {
+; CHECK-LABEL: define i16 @empty_use(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[DISPATCH:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK-NEXT: [[D_GEP_X:%.*]] = getelementptr i8, ptr addrspace(4) [[DISPATCH]], i64 12
@@ -296,7 +306,8 @@ entry:
}
define i32 @multiple_use() {
-; CHECK-LABEL: define i32 @multiple_use() {
+; CHECK-LABEL: define i32 @multiple_use(
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[IMPLICITARG:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) [[IMPLICITARG]], align 4, !invariant.load [[META0]], !noundef [[META0]]
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index 3563e737f5520..1fa939977fc7e 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; Function Attrs: mustprogress nofree norecurse nosync nounwind readnone willreturn
define amdgpu_kernel void @get_local_size_x(ptr addrspace(1) %out) #0 {
diff --git a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
index 8c4bd4e882ac6..aa8feb59bbbda 100644
--- a/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/reqd-work-group-size.ll
@@ -1,5 +1,5 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-lower-kernel-attributes,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=amdgpu-attributor,instcombine,infer-alignment %s | FileCheck -enable-var-scope %s
; CHECK-LABEL: @invalid_reqd_work_group_size(
; CHECK: load i16,
diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
index 6a88be6e55859..d655306dee19d 100644
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@@ -826,5 +826,5 @@ entry:
; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]]
; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind }
-; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) "uniform-work-group-size"="false" }
+; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) }
attributes #0 = { nounwind }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index d078403135963..d5cc5be2b6aa9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -165,7 +165,6 @@ static_library("LLVMAMDGPUCodeGen") {
"AMDGPULowerExecSync.cpp",
"AMDGPULowerIntrinsics.cpp",
"AMDGPULowerKernelArguments.cpp",
- "AMDGPULowerKernelAttributes.cpp",
"AMDGPULowerModuleLDSPass.cpp",
"AMDGPULowerVGPREncoding.cpp",
"AMDGPUMCInstLower.cpp",
From c61763845288ad74246466f167dff32154057c75 Mon Sep 17 00:00:00 2001
From: Yoonseo Choi <yoonchoi at amd.com>
Date: Thu, 22 Jan 2026 17:46:00 -0600
Subject: [PATCH 2/2] Add missed change on a test
---
.../amdgcnspirv-uses-amdgpu-abi.cpp | 44 +++++++++++--------
1 file changed, 26 insertions(+), 18 deletions(-)
diff --git a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
index 8f92d1fed1f9f..b6645409722aa 100644
--- a/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
+++ b/clang/test/CodeGenHIP/amdgcnspirv-uses-amdgpu-abi.cpp
@@ -81,7 +81,7 @@ __global__ void k4(SingleElement) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local amdgpu_kernel void @_Z2k55ByRef(
-// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// AMDGPU-SAME: ptr addrspace(4) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -117,7 +117,7 @@ __global__ void k7(unsigned*) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f0s(
-// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
+// AMDGPU-SAME: i16 noundef signext [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -129,7 +129,7 @@ __device__ void f0(short) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f1j(
-// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -141,7 +141,7 @@ __device__ void f1(unsigned) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f2d(
-// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: double noundef [[TMP0:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -153,7 +153,7 @@ __device__ void f2(double) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f311Transparent(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -165,7 +165,7 @@ __device__ void f3(Transparent) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f413SingleElement(
-// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: i32 [[DOTCOERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -177,7 +177,7 @@ __device__ void f4(SingleElement) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f55ByRef(
-// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ptr addrspace(5) noundef readnone byref([[STRUCT_BYREF:%.*]]) align 4 captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -189,7 +189,7 @@ __device__ void f5(ByRef) { }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z2f6Dv1_jDv2_jDv3_jDv4_j(
-// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: <1 x i32> noundef [[TMP0:%.*]], <2 x i32> noundef [[TMP1:%.*]], <3 x i32> noundef [[TMP2:%.*]], <4 x i32> noundef [[TMP3:%.*]]) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret void
//
@@ -201,7 +201,7 @@ __device__ void f6(V1, V2, V3, V4) { }
// AMDGCNSPIRV-NEXT: ret i16 0
//
// AMDGPU-LABEL: define dso_local noundef signext i16 @_Z2f7v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i16 0
//
@@ -213,7 +213,7 @@ __device__ short f7() { return 0; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z2f8v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -225,7 +225,7 @@ __device__ unsigned f8() { return 0; }
// AMDGCNSPIRV-NEXT: ret double 0.000000e+00
//
// AMDGPU-LABEL: define dso_local noundef double @_Z2f9v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret double 0.000000e+00
//
@@ -237,7 +237,7 @@ __device__ double f9() { return 0.; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z3f10v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -249,7 +249,7 @@ __device__ Transparent f10() { return {}; }
// AMDGCNSPIRV-NEXT: ret i32 0
//
// AMDGPU-LABEL: define dso_local noundef i32 @_Z3f11v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret i32 0
//
@@ -262,7 +262,7 @@ __device__ SingleElement f11() { return {}; }
// AMDGCNSPIRV-NEXT: ret void
//
// AMDGPU-LABEL: define dso_local void @_Z3f12v(
-// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] {
+// AMDGPU-SAME: ptr addrspace(5) dead_on_unwind noalias writable writeonly sret([[STRUCT_BYREF:%.*]]) align 4 captures(none) initializes((0, 68)) [[AGG_RESULT:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef align 4 dereferenceable(68) [[AGG_RESULT]], i8 0, i64 68, i1 false)
// AMDGPU-NEXT: ret void
@@ -275,7 +275,7 @@ __device__ ByRef f12() { return {}; }
// AMDGCNSPIRV-NEXT: ret <1 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <1 x i32> @_Z3f13v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <1 x i32> zeroinitializer
//
@@ -287,7 +287,7 @@ __device__ V1 f13() { return {}; }
// AMDGCNSPIRV-NEXT: ret <2 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <2 x i32> @_Z3f14v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <2 x i32> zeroinitializer
//
@@ -299,7 +299,7 @@ __device__ V2 f14() { return {}; }
// AMDGCNSPIRV-NEXT: ret <3 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <3 x i32> @_Z3f15v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <3 x i32> zeroinitializer
//
@@ -311,7 +311,7 @@ __device__ V3 f15() { return {}; }
// AMDGCNSPIRV-NEXT: ret <4 x i32> zeroinitializer
//
// AMDGPU-LABEL: define dso_local noundef <4 x i32> @_Z3f16v(
-// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR1]] {
+// AMDGPU-SAME: ) local_unnamed_addr #[[ATTR2]] {
// AMDGPU-NEXT: [[ENTRY:.*:]]
// AMDGPU-NEXT: ret <4 x i32> zeroinitializer
//
@@ -319,3 +319,11 @@ __device__ V4 f16() { return {}; }
//.
// AMDGCNSPIRV: [[META9]] = !{i32 1024, i32 1, i32 1}
//.
+
+// For reference, the AMDGPU attribute groups behind the #[[ATTRn]] checks above:
+// attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="true" }
+// attributes #2 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #3 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #4 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx906" "uniform-work-group-size"="false" }
+// attributes #5 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }