[llvm] AMDGPU: Infer no-agpr usage in AMDGPUAttributor (PR #85948)

Wed Mar 20 08:38:00 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

SIMachineFunctionInfo has a scan  of the function body for inline asm
which may use AGPRs, or callees in SIMachineFunctionInfo. Move this
into the attributor, so it actually works interprocedurally.
    
Could probably avoid most of the test churn if this bothered to avoid
adding this on subtargets without AGPRs. We should also probably
try to delete the MIR scan in usesAGPRs but it seems to be trickier
to eliminate.



---

Patch is 143.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/85948.diff


26 Files Affected:

- (modified) llvm/docs/AMDGPUUsage.rst (+5) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+94-2) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (+1-29) 
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll (+3-3) 
- (added) llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll (+255) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll (+22-22) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll (+13-13) 
- (modified) llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll (+10-9) 
- (modified) llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll (+15-15) 
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll (+10-10) 
- (modified) llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll (+9-9) 
- (modified) llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll (+22-22) 
- (modified) llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll (+1) 


``````````diff

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 29ea5005c0c409..6e6d6b15714809 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1454,6 +1454,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
                                              CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
                                              the three numbers are >= 1.
 
+     "amdgpu-no-agpr"                        Indicates the function will not require allocating AGPRs. This is only
+                                             relevant on subtargets with AGPRs. The behavior is undefined if a
+                                             function which requires AGPRs is reached through any function marked
+                                             with this attribute.
+
      ======================================= ==========================================================
 
 Calling Conventions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d7f5110427ecec..9bd30458bc0a7e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -918,6 +918,96 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
 }
 
+static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
+  for (const auto &CI : IA->ParseConstraints()) {
+    for (StringRef Code : CI.Codes) {
+      Code.consume_front("{");
+      if (Code.starts_with("a"))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+struct AAAMDGPUNoAGPR
+    : public IRAttribute<Attribute::NoUnwind,
+                         StateWrapper<BooleanState, AbstractAttribute>,
+                         AAAMDGPUNoAGPR> {
+  AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+
+  static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
+                                           Attributor &A) {
+    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+      return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
+    llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
+  }
+
+  void initialize(Attributor &A) override {
+    Function *F = getAssociatedFunction();
+    if (F->hasFnAttribute("amdgpu-no-agpr"))
+      indicateOptimisticFixpoint();
+  }
+
+  const std::string getAsStr(Attributor *A) const override {
+    return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
+  }
+
+  void trackStatistics() const override {}
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
+
+    auto CheckForNoAGPRs = [&](Instruction &I) {
+      const auto &CB = cast<CallBase>(I);
+      const Value *CalleeOp = CB.getCalledOperand();
+      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      if (!Callee) {
+        if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
+          return !inlineAsmUsesAGPRs(IA);
+        return false;
+      }
+
+      // Some intrinsics may use AGPRs, but if we have a choice, we are not
+      // required to use AGPRs.
+      if (Callee->isIntrinsic())
+        return true;
+
+      // TODO: Handle callsite attributes
+      const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      return CalleeInfo && CalleeInfo->getAssumed();
+    };
+
+    bool UsedAssumedInformation = false;
+    if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
+                                           UsedAssumedInformation))
+      return indicatePessimisticFixpoint();
+    return ChangeStatus::UNCHANGED;
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (!getAssumed())
+      return ChangeStatus::UNCHANGED;
+    LLVMContext &Ctx = getAssociatedFunction()->getContext();
+    return A.manifestAttrs(getIRPosition(),
+                           {Attribute::get(Ctx, "amdgpu-no-agpr")});
+  }
+
+  const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAAMDGPUNoAGPRs
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  static const char ID;
+};
+
+const char AAAMDGPUNoAGPR::ID = 0;
+
 static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
   for (unsigned I = 0;
@@ -946,8 +1036,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
   DenseSet<const char *> Allowed(
       {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
        &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
-       &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
-       &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
+       &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
+       &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
+       &AAUnderlyingObjects::ID});
 
   AttributorConfig AC(CGUpdater);
   AC.Allowed = &Allowed;
@@ -963,6 +1054,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
     if (!F.isIntrinsic()) {
       A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
       A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+      A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(F));
       CallingConv::ID CC = F.getCallingConv();
       if (!AMDGPU::isEntryFunctionCC(CC)) {
         A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 2569f40fec0e48..12433dc83c4892 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -748,35 +748,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
 }
 
 bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
-  for (const BasicBlock &BB : F) {
-    for (const Instruction &I : BB) {
-      const auto *CB = dyn_cast<CallBase>(&I);
-      if (!CB)
-        continue;
-
-      if (CB->isInlineAsm()) {
-        const InlineAsm *IA = dyn_cast<InlineAsm>(CB->getCalledOperand());
-        for (const auto &CI : IA->ParseConstraints()) {
-          for (StringRef Code : CI.Codes) {
-            Code.consume_front("{");
-            if (Code.starts_with("a"))
-              return true;
-          }
-        }
-        continue;
-      }
-
-      const Function *Callee =
-          dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
-      if (!Callee)
-        return true;
-
-      if (Callee->getIntrinsicID() == Intrinsic::not_intrinsic)
-        return true;
-    }
-  }
-
-  return false;
+  return !F.hasFnAttribute("amdgpu-no-agpr");
 }
 
 bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 66034af5c351fd..cff9ce05066793 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
new file mode 100644
index 00000000000000..33b1cc65dc5699
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call i32 asm sideeffect "; def $0", "=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_non_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "v"(i32 poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+  ret void
+}
+
+define void @func_uses_asm_virtreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
+
+define void @func_uses_asm_physreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+  ret void
+}
+
+define void @func_uses_asm_physreg_agpr_tuple() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+  ret void
+}
+
+declare void @unknown()
+
+define amdgpu_kernel void @kernel_calls_extern() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK-NEXT:    call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  call void @unknown() #0
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    call void [[INDIRECT]]()
+; CHECK-NEXT:    ret void
+;
+  call void %indirect()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT:    ret void
+;
+  call void %indirect() #0
+  ret void
+}
+
+define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT:    ret void
+;
+  call void @func_uses_asm_physreg_agpr()
+  ret void
+}
+
+define void @empty() {
+; CHECK-LABEL: define void @empty(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define void @also_empty() {
+; CHECK-LABEL: define void @also_empty(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_empty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @empty()
+; CHECK-NEXT:    ret void
+;
+  call void @empty()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @empty()
+; CHECK-NEXT:    call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT:    ret void
+;
+  call void @empty()
+  call void @func_uses_asm_physreg_agpr()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false)
+  ret void
+}
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128
+; CHECK-NEXT:    ret void
+;
+  %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+  store <32 x float> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %result = call i32 @llvm.amdgcn.workitem.id.x()
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
+; CHECK-NEXT:    call void [[FPTR]]()
+; CHECK-NEXT:    ret void
+;
+  %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+  call void %fptr()
+  ret void
+}
+
+
+attributes #0 = { "amdgpu-no-agpr" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/85948