[llvm] AMDGPU: Drop amdgpu-no-lds-kernel-id attribute in LDS lowering (PR #71481)

Mon Nov 6 19:41:25 PST 2023

https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/71481

This is in preparation for moving the run of AMDGPUAttributor earlier.
Currently it infers the lack of the corresponding intrinsic calls,
so if we introduce new ones we need to remove the attribute from any
possible transitive callers. This is more conservative than necessary,
we could try to identify specific subgraphs where LDS globals are not
used.

Other options include teaching the attributor to avoid adding it in cases
where the lowering may choose the table, but this seems more complex.
Alternatively could add a second run which doesn't seem worth it.

>From 0d31d8c6c186012abc659427d72e73fc1a1b36f2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 6 Nov 2023 12:09:28 +0900
Subject: [PATCH 1/3] AMDGPU: Port AMDGPUAttributor to new pass manager

---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  14 ++-
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   | 116 ++++++++++--------
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   8 +-
 .../AMDGPU/addrspacecast-constantexpr.ll      |   2 +-
 .../annotate-existing-abi-attributes.ll       |   2 +-
 .../annotate-kernel-features-hsa-call.ll      |   2 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   2 +-
 .../AMDGPU/annotate-kernel-features.ll        |   2 +-
 .../AMDGPU/attributor-loop-issue-58639.ll     |   2 +-
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |   2 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |   2 +-
 .../AMDGPU/implicitarg-offset-attributes.ll   |   4 +-
 .../AMDGPU/pal-simple-indirect-call.ll        |   2 +-
 .../AMDGPU/preload-kernargs-inreg-hints.ll    |  10 +-
 .../AMDGPU/propagate-flat-work-group-size.ll  |   1 +
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   2 +-
 .../AMDGPU/recursive_global_initializer.ll    |   2 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |   2 +-
 .../uniform-work-group-attribute-missing.ll   |   2 +-
 .../AMDGPU/uniform-work-group-multistep.ll    |   2 +-
 ...niform-work-group-nested-function-calls.ll |   2 +-
 ...ork-group-prevent-attribute-propagation.ll |   2 +-
 .../uniform-work-group-propagate-attribute.ll |   2 +-
 .../uniform-work-group-recursion-test.ll      |   2 +-
 .../CodeGen/AMDGPU/uniform-work-group-test.ll |   2 +-
 25 files changed, 112 insertions(+), 79 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 2c29710f8c8cb46..403014db56171ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -17,6 +17,7 @@
 namespace llvm {
 
 class AMDGPUTargetMachine;
+class GCNTargetMachine;
 class TargetMachine;
 
 // GlobalISel passes
@@ -86,8 +87,8 @@ extern char &AMDGPUMachineCFGStructurizerID;
 void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
 
 Pass *createAMDGPUAnnotateKernelFeaturesPass();
-Pass *createAMDGPUAttributorPass();
-void initializeAMDGPUAttributorPass(PassRegistry &);
+Pass *createAMDGPUAttributorLegacyPass();
+void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
@@ -262,6 +263,15 @@ class AMDGPULowerKernelArgumentsPass
   PreservedAnalyses run(Function &, FunctionAnalysisManager &);
 };
 
+class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
+private:
+  TargetMachine &TM;
+
+public:
+  AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){};
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
 ModulePass *createAMDGPUPrintfRuntimeBinding();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d7dc37066b1fd38..5fd9e571282dbcb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -933,9 +933,53 @@ static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
   }
 }
 
-class AMDGPUAttributor : public ModulePass {
+static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
+  SetVector<Function *> Functions;
+  for (Function &F : M) {
+    if (!F.isIntrinsic())
+      Functions.insert(&F);
+  }
+
+  CallGraphUpdater CGUpdater;
+  BumpPtrAllocator Allocator;
+  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
+  DenseSet<const char *> Allowed(
+      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+       &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
+       &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
+
+  AttributorConfig AC(CGUpdater);
+  AC.Allowed = &Allowed;
+  AC.IsModulePass = true;
+  AC.DefaultInitializeLiveInternals = false;
+  AC.IPOAmendableCB = [](const Function &F) {
+    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+  };
+
+  Attributor A(Functions, InfoCache, AC);
+
+  for (Function &F : M) {
+    if (!F.isIntrinsic()) {
+      A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+      A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+      CallingConv::ID CC = F.getCallingConv();
+      if (!AMDGPU::isEntryFunctionCC(CC)) {
+        A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+        A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
+      } else if (CC == CallingConv::AMDGPU_KERNEL) {
+        addPreloadKernArgHint(F, TM);
+      }
+    }
+  }
+
+  ChangeStatus Change = A.run();
+  return Change == ChangeStatus::CHANGED;
+}
+
+class AMDGPUAttributorLegacy : public ModulePass {
 public:
-  AMDGPUAttributor() : ModulePass(ID) {}
+  AMDGPUAttributorLegacy() : ModulePass(ID) {}
 
   /// doInitialization - Virtual method overridden by subclasses to do
   /// any necessary initialization before any pass is run.
@@ -949,48 +993,8 @@ class AMDGPUAttributor : public ModulePass {
   }
 
   bool runOnModule(Module &M) override {
-    SetVector<Function *> Functions;
     AnalysisGetter AG(this);
-    for (Function &F : M) {
-      if (!F.isIntrinsic())
-        Functions.insert(&F);
-    }
-
-    CallGraphUpdater CGUpdater;
-    BumpPtrAllocator Allocator;
-    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
-    DenseSet<const char *> Allowed(
-        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
-         &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
-         &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
-         &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
-
-    AttributorConfig AC(CGUpdater);
-    AC.Allowed = &Allowed;
-    AC.IsModulePass = true;
-    AC.DefaultInitializeLiveInternals = false;
-    AC.IPOAmendableCB = [](const Function &F) {
-      return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
-    };
-
-    Attributor A(Functions, InfoCache, AC);
-
-    for (Function &F : M) {
-      if (!F.isIntrinsic()) {
-        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
-        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
-        CallingConv::ID CC = F.getCallingConv();
-        if (!AMDGPU::isEntryFunctionCC(CC)) {
-          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
-          A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
-        } else if (CC == CallingConv::AMDGPU_KERNEL) {
-          addPreloadKernArgHint(F, *TM);
-        }
-      }
-    }
-
-    ChangeStatus Change = A.run();
-    return Change == ChangeStatus::CHANGED;
+    return runImpl(M, AG, *TM);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1003,11 +1007,25 @@ class AMDGPUAttributor : public ModulePass {
 };
 } // namespace
 
-char AMDGPUAttributor::ID = 0;
+PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
+                                                  ModuleAnalysisManager &AM) {
 
-Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
-INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
-                      false)
+  FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  AnalysisGetter AG(FAM);
+
+  // TODO: Probably preserves CFG
+  return runImpl(M, AG, TM) ? PreservedAnalyses::none()
+                            : PreservedAnalyses::all();
+}
+
+char AMDGPUAttributorLegacy::ID = 0;
+
+Pass *llvm::createAMDGPUAttributorLegacyPass() {
+  return new AMDGPUAttributorLegacy();
+}
+INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
+                      false, false)
 INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
-INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
-                    false)
+INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
+                    false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 375df27206f7b41..f9095e684515a72 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -381,7 +381,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
   initializeAMDGPUAlwaysInlinePass(*PR);
-  initializeAMDGPUAttributorPass(*PR);
+  initializeAMDGPUAttributorLegacyPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
   initializeAMDGPUArgumentUsageInfoPass(*PR);
@@ -610,6 +610,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
   PB.registerPipelineParsingCallback(
       [this](StringRef PassName, ModulePassManager &PM,
              ArrayRef<PassBuilder::PipelineElement>) {
+        if (PassName == "amdgpu-attributor") {
+          PM.addPass(AMDGPUAttributorPass(*this));
+          return true;
+        }
         if (PassName == "amdgpu-unify-metadata") {
           PM.addPass(AMDGPUUnifyMetadataPass());
           return true;
@@ -1021,7 +1025,7 @@ void AMDGPUPassConfig::addIRPasses() {
   // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
   // after their introduction
   if (TM.getOptLevel() > CodeGenOptLevel::None)
-    addPass(createAMDGPUAttributorPass());
+    addPass(createAMDGPUAttributorLegacyPass());
 
   if (TM.getOptLevel() > CodeGenOptLevel::None)
     addPass(createInferAddressSpacesPass());
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index c4e40d46e3179b0..089799383294ecf 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
 
 declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
index 84d97a86e5e5df8..28722021e0448f2 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-existing-abi-attributes.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor %s | FileCheck %s
 
 ; Check handling for pre-existing attributes on function declarations
 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index a516d9ecdaf0e4c..1f90c0d03a85661 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s
 
 ; TODO: The test contains UB which is refined by the Attributor and should be removed.
 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index aa3243404056d94..9e5df73d69a07b9 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
 
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 24339d526aa0338..6c5e58c74033945 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s
 
 declare i32 @llvm.r600.read.tgid.x() #0
 declare i32 @llvm.r600.read.tgid.y() #0
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
index 60da0445927743a..2b9f579e6a1839e 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-loop-issue-58639.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
 %0 = type { ptr, ptr }
 
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index c20df45f10a2950..0c034192869b184 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor < %s | FileCheck %s
 
 define internal void @indirect() {
 ; CHECK-LABEL: define {{[^@]+}}@indirect
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index fde1ca5ce02b741..0069370cc9721ca 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features  %s | FileCheck -check-prefix=AKF_GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
 
 define internal void @indirect() {
 ; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index c355023270c19e4..d5590754d78bc70 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor | FileCheck -check-prefixes=CHECK,V4 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor | FileCheck -check-prefixes=CHECK,V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor | FileCheck -check-prefixes=CHECK,V4 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor | FileCheck -check-prefixes=CHECK,V5 %s
 
 declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
 
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index c4af5f258243e60..2e9f09ad41813d9 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; Check that no attributes are added to graphics functions
 ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features  %s | FileCheck -check-prefixes=AKF_GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s
 
 ; Check that it doesn't crash
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
index 1238fa8c49a928b..e7488e059ee9df8 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=NO-PRELOAD %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=1 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-1 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=3 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-3 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=16 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-16 %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-kernarg-preload-count=20 -passes=amdgpu-attributor -S < %s | FileCheck -check-prefix=PRELOAD-20 %s
 
 define amdgpu_kernel void @test_preload_hint_kernel_1(ptr %0) #0 {
 ; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index 820862bba7b1400..d92ba7774bd3a19 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
 ; Check propagation of amdgpu-flat-work-group-size attribute.
 
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 3a6421131fcae1d..2df219bd0401e88 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 2
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck %s
 
 ; Check propagation of amdgpu-flat-work-group-size attribute.
 
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index d1ffa8262d253b3..eaef63bbfc3c9dc 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor -S %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor -S %s | FileCheck %s
 
 %struct.foo = type { %struct.pluto, ptr, i64 }
 %struct.pluto = type { [512 x i8], ptr }
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index babc9c3858f901d..dcc90c0dcd407e7 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features  %s | FileCheck -check-prefix=AKF_GCN %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
 
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index be7171cbb359ff6..8d5dc7943164f76 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -passes=amdgpu-attributor %s | FileCheck %s
 
 ; If the kernel does not have the uniform-work-group-attribute, set both callee and caller as false
 ; We write to a global so that the attributor don't deletes the function.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index f6cb99956e7e2a6..7a6f82d589e6838 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-attributor < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck %s
 
 ;.
 ; CHECK: @[[G1:[a-zA-Z0-9_$"\\.-]+]] = global ptr null
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index 052d5658ad8b0a9..c04154c7c23f2fb 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -passes=amdgpu-attributor < %s | FileCheck %s
 
 ; Test to verify if the attribute gets propagated across nested function calls
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index d481a5245f21f55..2d5ff045d12e5a1 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -passes=amdgpu-attributor < %s | FileCheck %s
 
 ; Function added to prevent attributor from deleting call sites.
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index 0141229830e4580..e8bf6fc8321b3af 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -passes=amdgpu-attributor %s | FileCheck %s
 
 @x = global i32 0
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 7ba7566506ca8cc..473eea4eedceca8 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd- -amdgpu-attributor %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-amd- -passes=amdgpu-attributor %s | FileCheck %s
 
 ; Test to ensure recursive functions exhibit proper behaviour
 ; Test to generate fibonacci numbers
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 0c8878567eb1d02..221f1a11676fc03 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-attributor %s | FileCheck -allow-unused-prefixes %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -allow-unused-prefixes %s
 
 @x = global i32 0
 ;.

>From 3fcd32a4f42e06f0e0d1b68b7b0efac4ab4a4e07 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 6 Nov 2023 17:33:49 +0900
Subject: [PATCH 2/3] AMDGPU: Add test for LDS module lowering removing
 amdgpu-no-lds-kernel-id

Prepare to move where AMDGPUAttributor runs. Currently it runs after
LDS lowering, where the lack of kernel ID intrinsics is inferred.
---
 .../AMDGPU/remove-no-kernel-id-attribute.ll   | 160 ++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll

diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
new file mode 100644
index 000000000000000..7c6d9eec2968551
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals --version 3
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-attributor,amdgpu-lower-module-lds < %s --amdgpu-lower-module-lds-strategy=table | FileCheck -check-prefixes=CHECK,TABLE %s
+
+; FIXME: Work around update_test_checks bug in constant expression handling by manually deleting part of the last global pattern
+
+ at function.lds = addrspace(3) global i16 poison
+ at other.kernel.lds = addrspace(3) global i16 poison
+
+;.
+; CHECK: @[[LLVM_AMDGCN_KERNEL_K0_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K0_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
+; CHECK: @[[LLVM_AMDGCN_KERNEL_K1_F0_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_K1_F0_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
+; CHECK: @[[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [[LLVM_AMDGCN_KERNEL_KERNEL_LDS_LDS_T:%.*]] poison, align 2, !absolute_symbol !0
+; CHECK: @[[LLVM_AMDGCN_LDS_OFFSET_TABLE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(4) constant [2 x [1 x i32]]
+;.
+define internal void @lds_use_through_indirect() {
+; CHECK-LABEL: define internal void @lds_use_through_indirect(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS2]], align 4
+; CHECK-NEXT:    [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) [[FUNCTION_LDS3]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 7
+; CHECK-NEXT:    [[FUNCTION_LDS:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS]], align 4
+; CHECK-NEXT:    [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT:    store i16 [[MUL]], ptr addrspace(3) [[FUNCTION_LDS1]], align 2
+; CHECK-NEXT:    ret void
+;
+  %ld = load i16, ptr addrspace(3) @function.lds
+  %mul = mul i16 %ld, 7
+  store i16 %mul, ptr addrspace(3) @function.lds
+  ret void
+}
+
+define internal void @indirectly_called() {
+; CHECK-LABEL: define internal void @indirectly_called(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    store volatile ptr @indirectly_called, ptr addrspace(1) null, align 8
+; CHECK-NEXT:    call void @lds_use_through_indirect()
+; CHECK-NEXT:    ret void
+;
+  store volatile ptr @indirectly_called, ptr addrspace(1) null
+  call void @lds_use_through_indirect()
+  ret void
+}
+
+define internal void @calls_indirectly_called() {
+; CHECK-LABEL: define internal void @calls_indirectly_called(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @indirectly_called()
+; CHECK-NEXT:    ret void
+;
+  call void @indirectly_called()
+  ret void
+}
+
+; TODO: Should still have "amdgpu-no-lds-kernel-id" attached
+define internal void @no_lds_global_use_leaf() {
+; CHECK-LABEL: define internal void @no_lds_global_use_leaf(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+; Should have "amdgpu-no-lds-kernel-id" stripped
+define internal void @f0() {
+; CHECK-LABEL: define internal void @f0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[FUNCTION_LDS2:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS2]], align 4
+; CHECK-NEXT:    [[FUNCTION_LDS3:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) [[FUNCTION_LDS3]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 4
+; CHECK-NEXT:    [[FUNCTION_LDS:%.*]] = getelementptr inbounds [2 x [1 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr addrspace(4) [[FUNCTION_LDS]], align 4
+; CHECK-NEXT:    [[FUNCTION_LDS1:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT:    store i16 [[MUL]], ptr addrspace(3) [[FUNCTION_LDS1]], align 2
+; CHECK-NEXT:    call void @no_lds_global_use_leaf()
+; CHECK-NEXT:    ret void
+;
+  %ld = load i16, ptr addrspace(3) @function.lds
+  %mul = mul i16 %ld, 4
+  store i16 %mul, ptr addrspace(3) @function.lds
+  call void @no_lds_global_use_leaf()
+  ret void
+}
+
+; Should have "amdgpu-no-lds-kernel-id" stripped
+define internal void @f0_transitive() {
+; CHECK-LABEL: define internal void @f0_transitive(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @f0()
+; CHECK-NEXT:    call void @no_lds_global_use_leaf()
+; CHECK-NEXT:    ret void
+;
+  call void @f0()
+  call void @no_lds_global_use_leaf()
+  ret void
+}
+
+define amdgpu_kernel void @k0_f0() {
+; CHECK-LABEL: define amdgpu_kernel void @k0_f0(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !1 {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ]
+; CHECK-NEXT:    call void @f0_transitive()
+; CHECK-NEXT:    ret void
+;
+  call void @f0_transitive()
+  ret void
+}
+
+define amdgpu_kernel void @k1_f0() {
+; CHECK-LABEL: define amdgpu_kernel void @k1_f0(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id !2 {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ]
+; CHECK-NEXT:    call void @f0_transitive()
+; CHECK-NEXT:    [[FPTR:%.*]] = load volatile ptr, ptr addrspace(1) null, align 8
+; CHECK-NEXT:    call void [[FPTR]]()
+; CHECK-NEXT:    call void @calls_indirectly_called()
+; CHECK-NEXT:    ret void
+;
+  call void @f0_transitive()
+  %fptr = load volatile ptr, ptr addrspace(1) null
+  call void %fptr()
+  call void @calls_indirectly_called()
+  ret void
+}
+
+; Should still have "amdgpu-no-lds-kernel-id" attached
+define amdgpu_kernel void @kernel_lds() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_lds(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    [[LD:%.*]] = load i16, ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds.lds, align 2
+; CHECK-NEXT:    [[MUL:%.*]] = mul i16 [[LD]], 42
+; CHECK-NEXT:    store i16 [[MUL]], ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds.lds, align 2
+; CHECK-NEXT:    ret void
+;
+  %ld = load i16, ptr addrspace(3) @other.kernel.lds
+  %mul = mul i16 %ld, 42
+  store i16 %mul, ptr addrspace(3) @other.kernel.lds
+  ret void
+}
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="2" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+;.
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1}
+; CHECK: [[META1:![0-9]+]] = !{i32 0}
+; CHECK: [[META2:![0-9]+]] = !{i32 1}
+;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; TABLE: {{.*}}

>From d274f3c59e165f885931eadaa753a70d33cb2ad9 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 6 Nov 2023 17:18:54 +0900
Subject: [PATCH 3/3] AMDGPU: Drop amdgpu-no-lds-kernel-id attribute in LDS
 lowering

This is in preparation for moving the run of AMDGPUAttributor earlier.
Currently it infers the lack of the corresponding intrinsic calls,
so if we introduce new ones we need to remove the attribute from any
possible transitive callers. This is more conservative than necessary,
we could try to identify specific subgraphs where LDS globals are not
used.
---
 .../AMDGPU/AMDGPULowerModuleLDSPass.cpp       | 43 +++++++++++++++++++
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |  6 +--
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index efe40e52065426d..64d72b6ff50363b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -1028,6 +1028,42 @@ class AMDGPULowerModuleLDS {
     return N;
   }
 
+  /// Strip "amdgpu-no-lds-kernel-id" from any functions where we may have
+  /// introduced its use. If AMDGPUAttributor ran prior to the pass, we inferred
+  /// the lack of llvm.amdgcn.lds.kernel.id calls.
+  void removeNoLdsKernelIdFromReachable(CallGraph &CG, Function *KernelRoot) {
+    KernelRoot->removeFnAttr("amdgpu-no-lds-kernel-id");
+
+    SmallVector<Function *> Tmp({CG[KernelRoot]->getFunction()});
+    if (!Tmp.back())
+      return;
+
+    do {
+      Function *F = Tmp.pop_back_val();
+
+      for (auto &N : *CG[F]) {
+        if (!N.second)
+          continue;
+
+        Function *Callee = N.second->getFunction();
+        if (!Callee) {
+          // If we see any indirect calls, assume nothing about potential
+          // targets.
+          // TODO: This could be refined to possible LDS global users.
+          for (auto &N : *CG.getCallsExternalNode()) {
+            Function *PotentialCallee = N.second->getFunction();
+            Tmp.push_back(PotentialCallee);
+          }
+
+          continue;
+        }
+
+        Callee->removeFnAttr("amdgpu-no-lds-kernel-id");
+        Tmp.push_back(Callee);
+      }
+    } while (!Tmp.empty());
+  }
+
   DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
       Module &M, LDSUsesInfoTy &LDSUsesInfo,
       DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
@@ -1177,6 +1213,13 @@ class AMDGPULowerModuleLDS {
           M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
       replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
                                                LookupTable);
+
+      // Strip amdgpu-no-lds-kernel-id from all functions reachable from the
+      // kernel. We may have inferred this wasn't used prior to the pass.
+      //
+      // TODO: We could filter out subgraphs that do not access LDS globals.
+      for (Function *F : KernelsThatAllocateTableLDS)
+        removeNoLdsKernelIdFromReachable(CG, F);
     }
 
     DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 7c6d9eec2968551..04c21ae8c9556f6 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -144,9 +144,9 @@ define amdgpu_kernel void @kernel_lds() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="2" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }