[llvm] AMDGPU: Figure out required AGPR count for inline asm (PR #150910)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sat Oct 4 03:58:19 PDT 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/150910

>From 9ee5c4c9595fcf8fd360d9ce6c5705196d3041c8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 28 Jul 2025 16:11:28 +0900
Subject: [PATCH 1/4] AMDGPU: Figure out required AGPR count for inline asm

For now just try to compute the minimum number of AGPRs required
to allocate the asm. Leave the attributor changes to turn this
into an integer value for later.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  59 +++++-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       | 199 ++++++++++++++++++
 2 files changed, 251 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 9dd64e0f6b35b..071f6c55b04fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1211,16 +1211,61 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
   llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
 }
 
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
-  for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+                                             const CallBase &Call) {
+  unsigned ArgNo = 0;
+  unsigned ResNo = 0;
+  unsigned AGPRDefCount = 0;
+  unsigned AGPRUseCount = 0;
+  unsigned MaxPhysReg = 0;
+  const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+    Type *Ty = nullptr;
+    switch (CI.Type) {
+    case InlineAsm::isOutput: {
+      Ty = Call.getType();
+      if (auto *STy = dyn_cast<StructType>(Ty))
+        Ty = STy->getElementType(ResNo);
+      ++ResNo;
+      break;
+    }
+    case InlineAsm::isInput: {
+      Ty = Call.getArgOperand(ArgNo++)->getType();
+      break;
+    }
+    case InlineAsm::isLabel:
+      continue;
+    case InlineAsm::isClobber:
+      // Parse the physical register reference.
+      break;
+    }
+
     for (StringRef Code : CI.Codes) {
-      Code.consume_front("{");
-      if (Code.starts_with("a"))
-        return true;
+      if (Code.starts_with("a")) {
+        // Virtual register, compute number of registers based on the type.
+        //
+        // We ought to be going through TargetLowering to get the number of
+        // registers, but we should avoid the dependence on CodeGen here.
+        unsigned RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+        if (CI.Type == InlineAsm::isOutput) {
+          AGPRDefCount += RegCount;
+          if (CI.isEarlyClobber)
+            AGPRUseCount += RegCount;
+        } else
+          AGPRUseCount += RegCount;
+      } else {
+        // Physical register reference
+        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+        if (Kind == 'a')
+          MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+      }
     }
   }
 
-  return false;
+  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+  return std::min(MaxVirtReg + MaxPhysReg, 256u);
 }
 
 // TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1262,7 +1307,7 @@ struct AAAMDGPUNoAGPR
       const Function *Callee = dyn_cast<Function>(CalleeOp);
       if (!Callee) {
         if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
-          return !inlineAsmUsesAGPRs(IA);
+          return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
         return false;
       }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 664dfa21759cf..a67f6cd955612 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -251,6 +251,205 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
   ret void
 }
 
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call {i32, i32} asm sideeffect "; def $0", "=a,=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=v"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "a"(ptr poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call ptr asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call ptr asm sideeffect "; def $0", "=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call <2 x ptr> asm sideeffect "; def $0", "=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call {i32, i32} asm sideeffect "; def $0", "={a0},={a[4:5]}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; clobber $0", "~{a4}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; clobber $0", "~{a[10:13]}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; clobber $0", "~{a256}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; clobber $0", "~{a255}"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "{a256}"(i32 poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call <32 x i32> asm sideeffect "; def $0", "=a"()
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "a"(<32 x i32> poison)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call <32 x i32> asm sideeffect "; use $0", "=a,a"(<32 x i32> poison)
+  ret void
+}
+
+define amdgpu_kernel void @vreg_use_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0", "a"(<257 x i32> poison)
+  ret void
+}
+
+define amdgpu_kernel void @vreg_def_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call <257 x i32> asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call <257 x i32> asm sideeffect "; def $0", "=a"()
+  ret void
+}
+
+define amdgpu_kernel void @multiple() {
+; CHECK-LABEL: define amdgpu_kernel void @multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call {<16 x i32>, <8 x i32>, <8 x i32>} asm sideeffect "; def $0", "=a,=a,=a,a,a,a"(<4 x i32> splat (i32 0), <8 x i32> splat (i32 1), i64 999)
+  ret void
+}
+
+define amdgpu_kernel void @earlyclobber_0() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call <8 x i32> asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(i32 0)
+  ret void
+}
+
+define amdgpu_kernel void @earlyclobber_1() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
+; CHECK-NEXT:    ret void
+;
+  %def = call { <8 x i32>, <16 x i32 > } asm sideeffect "; def $0, $1", "=&a,=&a,a,a"(i32 0, <16 x i32> splat (i32 1))
+  ret void
+}
 
 attributes #0 = { "amdgpu-agpr-alloc"="0" }
 ;.

>From 1769175bf8eafb226e3506118e2385b89409a9f2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 4 Oct 2025 19:34:08 +0900
Subject: [PATCH 2/4] reduce test noise

---
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       | 232 ++++++++++++++----
 1 file changed, 180 insertions(+), 52 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index a67f6cd955612..49bcc9de2b644 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -1,103 +1,166 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
 
+; Shrink the result attribute list: calling this helper uses most implicit inputs, preventing most attributes from being inferred.
+define internal void @use_most() {
+; CHECK-LABEL: define internal void @use_most(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [256 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[ALLOCA_CAST:%.*]] = addrspacecast ptr addrspace(5) [[ALLOCA]] to ptr
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
+; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.amdgcn.cluster.id.x()
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.cluster.id.y()
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.cluster.id.z()
+; CHECK-NEXT:    [[TMP7:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP8:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+; CHECK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
+; CHECK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[IMPLICIT_ARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p4.i64(ptr [[ALLOCA_CAST]], ptr addrspace(4) [[IMPLICIT_ARG_PTR]], i64 256, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca [256 x i8], addrspace(5)
+  %alloca.cast = addrspacecast ptr addrspace(5) %alloca to ptr
+  call i32 @llvm.amdgcn.workitem.id.x()
+  call i32 @llvm.amdgcn.workitem.id.y()
+  call i32 @llvm.amdgcn.workitem.id.z()
+  call i32 @llvm.amdgcn.workgroup.id.x()
+  call i32 @llvm.amdgcn.workgroup.id.y()
+  call i32 @llvm.amdgcn.workgroup.id.z()
+  call i32 @llvm.amdgcn.cluster.id.x()
+  call i32 @llvm.amdgcn.cluster.id.y()
+  call i32 @llvm.amdgcn.cluster.id.z()
+  call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+  call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
+  call i64 @llvm.amdgcn.dispatch.id()
+  call i32 @llvm.amdgcn.lds.kernel.id()
+  %implicit.arg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+  call void @llvm.memcpy.p0.p4(ptr %alloca.cast, ptr addrspace(4) %implicit.arg.ptr, i64 256, i1 false)
+  ret void
+}
+
 define amdgpu_kernel void @kernel_uses_asm_virtreg() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "a"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call i32 asm sideeffect "; def $0", "=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_non_agpr_asm() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "v"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_physreg() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+  call void @use_most()
   ret void
 }
 
 define void @func_uses_asm_virtreg_agpr() {
 ; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "a"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define void @func_uses_asm_physreg_agpr() {
 ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define void @func_uses_asm_physreg_agpr_tuple() {
 ; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+  call void @use_most()
   ret void
 }
 
@@ -105,99 +168,119 @@ declare void @unknown()
 
 define amdgpu_kernel void @kernel_calls_extern() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
-; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
-; CHECK-SAME: ) #[[ATTR2]] {
-; CHECK-NEXT:    call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void @unknown() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown() #0
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    call void [[INDIRECT]]()
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
-; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR6]]
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR5]]
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect() #0
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @func_uses_asm_physreg_agpr()
+  call void @use_most()
   ret void
 }
 
 define void @empty() {
 ; CHECK-LABEL: define void @empty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
+  call void @use_most()
   ret void
 }
 
 define void @also_empty() {
 ; CHECK-LABEL: define void @also_empty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_empty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @empty()
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @empty()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void @empty()
 ; CHECK-NEXT:    call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @empty()
   call void @func_uses_asm_physreg_agpr()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic(
-; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false)
+  call void @use_most()
   ret void
 }
 
@@ -205,31 +288,35 @@ declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>
 
 define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0)
 ; CHECK-NEXT:    store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
   store <32 x float> %result, ptr addrspace(1) %out
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x(
-; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
 ; CHECK-NEXT:    store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %result = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %result, ptr addrspace(1) %out
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR1]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq ptr [[FPTR]], @also_empty
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP3:%.*]]
@@ -244,220 +331,261 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK:       5:
 ; CHECK-NEXT:    unreachable
 ; CHECK:       6:
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
   call void %fptr()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call {i32, i32} asm sideeffect "; def $0", "=a,=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=v"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "a"(ptr poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call ptr asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call ptr asm sideeffect "; def $0", "=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call <2 x ptr> asm sideeffect "; def $0", "=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call {i32, i32} asm sideeffect "; def $0", "={a0},={a[4:5]}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_clobber() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; clobber $0", "~{a4}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; clobber $0", "~{a[10:13]}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; clobber $0", "~{a256}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; clobber $0", "~{a255}"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "{a256}"(i32 poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call <32 x i32> asm sideeffect "; def $0", "=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "a"(<32 x i32> poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call <32 x i32> asm sideeffect "; use $0", "=a,a"(<32 x i32> poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @vreg_use_exceeds_register_file() {
 ; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   call void asm sideeffect "; use $0", "a"(<257 x i32> poison)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @vreg_def_exceeds_register_file() {
 ; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call <257 x i32> asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call <257 x i32> asm sideeffect "; def $0", "=a"()
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @multiple() {
 ; CHECK-LABEL: define amdgpu_kernel void @multiple(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call {<16 x i32>, <8 x i32>, <8 x i32>} asm sideeffect "; def $0", "=a,=a,=a,a,a,a"(<4 x i32> splat (i32 0), <8 x i32> splat (i32 1), i64 999)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @earlyclobber_0() {
 ; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call <8 x i32> asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(i32 0)
+  call void @use_most()
   ret void
 }
 
 define amdgpu_kernel void @earlyclobber_1() {
 ; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
-; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-SAME: ) #[[ATTR1]] {
 ; CHECK-NEXT:    [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
 ; CHECK-NEXT:    ret void
 ;
   %def = call { <8 x i32>, <16 x i32 > } asm sideeffect "; def $0, $1", "=&a,=&a,a,a"(i32 0, <16 x i32> splat (i32 1))
+  call void @use_most()
   ret void
 }
 
 attributes #0 = { "amdgpu-agpr-alloc"="0" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-agpr-alloc"="0" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-agpr-alloc"="0" }
 ;.

>From 8d4f9325e1a287853792cfb0b23cc584519e542e Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 4 Oct 2025 19:39:35 +0900
Subject: [PATCH 3/4] More tests

---
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 49bcc9de2b644..f19d563067eb2 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -580,6 +580,42 @@ define amdgpu_kernel void @earlyclobber_1() {
   ret void
 }
 
+define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512() {
+; CHECK-LABEL: define amdgpu_kernel void @physreg_a32__vreg_a256__vreg_a512(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    call void asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
+; CHECK-NEXT:    ret void
+;
+  call void asm sideeffect "; use $0, $1, $2", "{a16},a,a"(i32 poison, <8 x i32> poison, <16 x i32> poison)
+  call void @use_most()
+  ret void
+}
+
+define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512() {
+; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32__def_vreg_a256__def_vreg_a512(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i32, <8 x i32>, <16 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
+; CHECK-NEXT:    ret void
+;
+  call {i32, <8 x i32>, <16 x i32>} asm sideeffect "; def $0, $1, $2", "={a16},=a,=a"()
+  call void @use_most()
+  ret void
+}
+
+define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256() {
+; CHECK-LABEL: define amdgpu_kernel void @physreg_def_a32___def_vreg_a512_use_vreg_a256(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i32, <16 x i32> } asm sideeffect "
+; CHECK-NEXT:    call void @use_most()
+; CHECK-NEXT:    ret void
+;
+  call {i32, <16 x i32>} asm sideeffect "; def $0, $1, $2", "={a16},=a,a"(<8 x i32> poison)
+  call void @use_most()
+  ret void
+}
+
 attributes #0 = { "amdgpu-agpr-alloc"="0" }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }

>From fc3aa2a708487f9d145e2bc89183cf0cddde7a32 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 4 Oct 2025 19:53:08 +0900
Subject: [PATCH 4/4] Rework accounting of mixed physical and virtual registers

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 23 +++++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 071f6c55b04fd..48a7e6a463c3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1243,29 +1243,34 @@ static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
     }
 
     for (StringRef Code : CI.Codes) {
+      unsigned RegCount = 0;
+
       if (Code.starts_with("a")) {
         // Virtual register, compute number of registers based on the type.
         //
         // We ought to be going through TargetLowering to get the number of
         // registers, but we should avoid the dependence on CodeGen here.
-        unsigned RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
-        if (CI.Type == InlineAsm::isOutput) {
-          AGPRDefCount += RegCount;
-          if (CI.isEarlyClobber)
-            AGPRUseCount += RegCount;
-        } else
-          AGPRUseCount += RegCount;
+        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
       } else {
         // Physical register reference
         auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
-        if (Kind == 'a')
+        if (Kind == 'a') {
+          RegCount = NumRegs;
           MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+        }
       }
+
+      if (CI.Type == InlineAsm::isOutput) {
+        AGPRDefCount += RegCount;
+        if (CI.isEarlyClobber)
+          AGPRUseCount += RegCount;
+      } else
+        AGPRUseCount += RegCount;
     }
   }
 
   unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
-  return std::min(MaxVirtReg + MaxPhysReg, 256u);
+  return std::min(std::max(MaxVirtReg, MaxPhysReg), 256u);
 }
 
 // TODO: Migrate to range merge of amdgpu-agpr-alloc.



More information about the llvm-commits mailing list