[clang] [llvm] [AMDGPU] Add initial support for VGPR as memory (PR #205435)

Tue Jun 30 15:52:28 PDT 2026

https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/205435

>From b13adc506c96a4686ba827e24310ff273d53d917 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 19 Jun 2026 14:05:08 -0500
Subject: [PATCH 1/2] Add initial support for VGPR as memory

---
 clang/include/clang/Basic/Attr.td             |   8 +
 clang/include/clang/Basic/AttrDocs.td         |  20 ++
 .../clang/Basic/DiagnosticCommonKinds.td      |   5 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   3 +
 clang/include/clang/Sema/SemaAMDGPU.h         |   1 +
 clang/lib/CodeGen/CGDecl.cpp                  |  41 +++-
 clang/lib/Sema/SemaAMDGPU.cpp                 |  14 ++
 clang/lib/Sema/SemaDeclAttr.cpp               |   3 +
 .../CodeGenHIP/amdgpu-vgpr-O0-warning.hip     |  14 ++
 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip    |  19 ++
 ...a-attribute-supported-attributes-list.test |   1 +
 clang/test/SemaCUDA/amdgpu-vgpr.cu            |  28 +++
 llvm/include/llvm/Support/AMDGPUAddrSpace.h   |   4 +
 llvm/lib/IR/VerifierAMDGPU.cpp                |   6 +-
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  16 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +++++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp       | 145 +++++++++++
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 231 ++++++++++++++++--
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  28 ++-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  19 ++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  11 +
 .../AMDGPU/amdgpu-vgpr-allocate-basic.ll      | 109 +++++++++
 .../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll     |  20 ++
 .../CodeGen/AMDGPU/as-vgpr-alloca-static.ll   |  58 +++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   9 +-
 llvm/test/Verifier/AMDGPU/alloca.ll           |  55 +++--
 31 files changed, 983 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
 create mode 100644 clang/test/SemaCUDA/amdgpu-vgpr.cu
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 3f57104d474a7..20f42ce4bd8f7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2522,6 +2522,14 @@ def AMDGPUMaxNumWorkGroups : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUVGPR : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_vgpr">];
+  let Documentation = [AMDGPUVGPRDocs];
+  let Subjects = SubjectList<[LocalVar], ErrorDiag>;
+  // Only meaningful in CUDA/HIP; semantic checks restrict it to kernel locals.
+  let LangOpts = [CUDA];
+}
+
 def BPFPreserveAccessIndex : InheritableAttr,
                              TargetSpecificAttr<TargetBPF>  {
   let Spellings = [Clang<"preserve_access_index">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index d806adb4be4b8..7dcf35fe3bd83 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3604,6 +3604,26 @@ An error will be given if:
   }];
 }
 
+def AMDGPUVGPRDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+This attribute requests that a kernel-local variable be allocated in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
+so that accesses with statically known indices lower to vector register
+copies instead of scratch memory traffic.
+
+Clang supports the ``__attribute__((amdgpu_vgpr))`` or
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
+local variables declared in a ``__global__`` (kernel) function; applying it to
+a variable in a ``__device__`` or host function is an error. Outside HIP/CUDA
+the attribute is ignored with a warning.
+
+Known limitation: the request is only honored with optimizations enabled. At
+``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
+emitted.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..fe03be43c80c7 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,6 +319,11 @@ def warn_stack_protection_ignore_attribute : Warning<
   "'stack_protector_ignore' attribute ignored due to "
   "'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
 
+def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
+  "%0 is not guaranteed to keep the variable in vector registers at -O0; "
+  "it may fall back to scratch memory">,
+  InGroup<DiagGroup<"amdgpu-vgpr">>;
+
 def warn_slh_does_not_support_asm_goto : Warning<
   "speculative load hardening does not protect functions with asm goto">,
   InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 7360c9bbab60a..a5e56e94509da 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3711,6 +3711,9 @@ def err_attribute_argument_invalid : Error<
 def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
   "'amdgpu_flat_work_group_size' attribute must match "
   "'reqd_work_group_size' product">;
+def err_amdgpu_vgpr_not_kernel_local : Error<
+  "%0 attribute can only be applied to local variables in "
+  "'__global__' (kernel) functions">;
 def err_attribute_argument_is_zero : Error<
   "%0 attribute must be greater than 0">;
 def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index a6205534e0de3..9cb74ed74f4b9 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -79,6 +79,7 @@ class SemaAMDGPU : public SemaBase {
   void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+  void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
 
   /// Expand a valid use of the feature identification builtins into its
   /// corresponding sequence of instructions.
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 7608f8cb6fc7a..bca2d11d47c6a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include <optional>
 
 using namespace clang;
@@ -1601,9 +1602,37 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // Create the alloca.  Note that we set the name separately from
       // building the instruction so that it's there even in no-asserts
       // builds.
-      address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
-                                 allocaAlignment, D.getName(),
-                                 /*ArraySize=*/nullptr, &AllocaAddr);
+      //
+      // "VGPR as memory" objects keep their backing registers only once the
+      // optimizing register allocator runs. At -O0 the backend cannot lower
+      // these accesses (e.g. when the address escapes a basic block), so the
+      // request is not honored: fall back to an ordinary (scratch) alloca and
+      // warn, matching the documented behavior.
+      // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
+      // backing tuple to scratch) so this fallback can be removed.
+      const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+      const bool UseVGPRMemory =
+          VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
+      if (VGPRAttr && !UseVGPRMemory)
+        CGM.getDiags().Report(D.getLocation(),
+                              diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
+            << VGPRAttr;
+
+      if (UseVGPRMemory) {
+        // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
+        // address space so that statically indexed accesses lower to vector
+        // register copies instead of scratch memory.
+        auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
+                                        /*ArraySize=*/nullptr, D.getName(),
+                                        AllocaInsertPt->getIterator());
+        AI->setAlignment(allocaAlignment.getAsAlign());
+        AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+        address = AllocaAddr;
+      } else {
+        address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
+                                   allocaAlignment, D.getName(),
+                                   /*ArraySize=*/nullptr, &AllocaAddr);
+      }
 
       // Don't emit lifetime markers for MSVC catch parameters. The lifetime of
       // the catch parameter starts in the catchpad instruction, and we can't
@@ -1612,8 +1641,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
           D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
 
       // Emit a lifetime intrinsic if meaningful. There's no point in doing this
-      // if we don't have a valid insertion point (?).
-      if (HaveInsertPoint() && !IsMSCatchParam) {
+      // if we don't have a valid insertion point (?). "VGPR as memory" allocas
+      // live in a non-alloca address space, so the standard lifetime markers
+      // (which assume the alloca address space) are skipped for them.
+      if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
         // If there's a jump into the lifetime of this variable, its lifetime
         // gets broken up into several regions in IR, which requires more work
         // to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index bd9e7e7b71ed6..0568ab0b60a07 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DynamicRecursiveASTVisitor.h"
 #include "clang/AST/Expr.h"
@@ -626,6 +627,19 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
   addAMDGPUFlatWorkGroupSizeAttr(D, AL, MinExpr, MaxExpr);
 }
 
+void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
+  // Subject matching (LocalVar) has already rejected non-local variables, so
+  // only the enclosing context remains to be checked: the variable must be
+  // declared directly in a __global__ kernel, not in __device__ or host code.
+  const auto *Owner = dyn_cast<FunctionDecl>(D->getDeclContext());
+  if (Owner && Owner->hasAttr<CUDAGlobalAttr>()) {
+    auto &Ctx = getASTContext();
+    D->addAttr(::new (Ctx) AMDGPUVGPRAttr(Ctx, AL));
+    return;
+  }
+  Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+}
+
 static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
                                            Expr *MaxExpr,
                                            const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2159c586e5738..095a11acdd02d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7641,6 +7641,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     S.AMDGPU().handleAMDGPUNumVGPRAttr(D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUVGPR:
+    S.AMDGPU().handleAMDGPUVGPRAttr(D, AL);
+    break;
   case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
     S.AMDGPU().handleAMDGPUMaxNumWorkGroupsAttr(D, AL);
     break;
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
new file mode 100644
index 0000000000000..4d23008b8ef43
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// At -O0 "VGPR as memory" is not honored: the variable falls back to an
+// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
+
+#define __global__ __attribute__((global))
+
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
new file mode 100644
index 0000000000000..9a5c38e48951c
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -emit-llvm -O1 -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
+// memory" address space (addrspace(13)), and its accesses stay in that space.
+
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
+// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
+// CHECK: store i32 %{{.*}}, ptr addrspace(13)
+// CHECK: load i32, ptr addrspace(13)
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr));
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 8bca68e2119e7..e79215f090214 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -7,6 +7,7 @@
 // CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUVGPR (SubjectMatchRule_variable_is_local)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
new file mode 100644
index 0000000000000..6ad3074921b9b
--- /dev/null
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+__global__ void kernel() {
+  int ok[4] __attribute__((amdgpu_vgpr)); // OK
+  (void)ok;
+}
+
+__device__ void device_fn() {
+  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+  (void)bad;
+}
+
+__host__ void host_fn() {
+  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+  (void)bad;
+}
+
+// Not a local variable.
+int global_var __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+
+__global__ void takes_no_args() {
+  // Attribute does not accept arguments.
+  int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
+  (void)bad;
+}
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 01b1510524d0f..e9d3add54d054 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -47,6 +47,10 @@ enum : unsigned {
   BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
                               ///< pointers with an additional index.
 
+  VGPR = 13, ///< Address space for "VGPR as memory": objects backed by VGPRs
+             ///< rather than scratch. Shares its numeric value with the
+             ///< graphics-only CONSTANT_BUFFER_5 alias below.
+
   RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
 
   /// Internal address spaces. Can be freely renumbered.
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index 04cb214ef2520..de9a0c7bef132 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -122,8 +122,10 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
   if (!VS.TT.isAMDGPU())
     return;
 
-  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+      AI.getAddressSpace() != AMDGPUAS::VGPR)
+    VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
+                   &AI);
 }
 
 bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index e4367811f1ed4..d19333f14ee63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernelArgumentsLegacyID;
 
 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca();
+FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
@@ -276,6 +276,20 @@ struct AMDGPUPromoteAllocaPass
   TargetMachine &TM;
 };
 
+void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+extern char &AMDGPUPrivateObjectVGPRsID;
+
+// Allocates pre-existing VGPR address space allocas without performing any
+// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
+// objects remain functional.
+struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
+  AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
 struct AMDGPUPromoteAllocaToVectorPass
     : OptionalPassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
   AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7330f3b13f3cb..8e289058a2ed1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
 #include "R600RegisterInfo.h"
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -341,25 +343,159 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  if (!Subtarget->d16PreservesUnusedBits())
-    return;
+// Resolve the constant byte offset within the per-function VGPR file for a
+// "VGPR as memory" access whose (legalized) address is \p Ptr: either a frame
+// index or (ADD|PTRADD frame-index, constant). Returns std::nullopt for any
+// other address or when the frame object is not an AMDGPUAS::VGPR alloca.
+static std::optional<unsigned>
+getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
+  unsigned ExtraOffset = 0; // constant addend peeled off the address, if any
+  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
+      ExtraOffset = C->getZExtValue();
+      Ptr = Ptr.getOperand(0); // continue with the base of the addition
+    }
+  }
+  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+  if (!FI)
+    return std::nullopt;
+  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+    return std::nullopt;
+  return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+}
+
+// Lower a simple (non-extending / non-truncating) load or store of a "VGPR as
+// memory" object into one SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each
+// carrying a constant byte offset; AMDGPUPrivateObjectVGPRs later expands the
+// pseudos into subregister copies. Wider accesses (e.g. i64, vectors) become
+// dword lanes on the original chain. Sub-dword and non-dword-multiple accesses
+// are left alone (AMDGPUPromoteAlloca demotes such objects to scratch), as are
+// non-dword-aligned offsets. Returns true if \p N was rewritten.
+bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
+  if (auto *Load = dyn_cast<LoadSDNode>(N)) {
+    if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
+        Load->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+    EVT VT = Load->getValueType(0);
+    unsigned Bits = VT.getFixedSizeInBits();
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    std::optional<unsigned> Offset =
+        getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
+    if (!Offset || (*Offset % 4 != 0))
+      return false;
+
+    SDLoc DL(N);
+    unsigned NumDwords = Bits / 32;
+    SmallVector<SDValue, 4> Dwords;
+    SmallVector<SDValue, 4> Chains;
+    for (unsigned I = 0; I != NumDwords; ++I) {
+      SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+                       Load->getChain()};
+      MachineSDNode *Lane = CurDAG->getMachineNode(
+          AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
+      if (I == 0) // only the first lane carries the original memory operand
+        CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
+      Dwords.push_back(SDValue(Lane, 0));
+      Chains.push_back(SDValue(Lane, 1));
+    }
+
+    SDValue Val;
+    if (NumDwords == 1) {
+      Val = Dwords[0];
+      if (VT != MVT::i32)
+        Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
+    } else {
+      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+      SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
+      Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
+    }
+    SDValue Chain = NumDwords == 1 ? Chains[0]
+                                   : CurDAG->getNode(ISD::TokenFactor, DL,
+                                                     MVT::Other, Chains);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
+    return true;
+  }
+
+  if (auto *Store = dyn_cast<StoreSDNode>(N)) {
+    if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
+        Store->isTruncatingStore())
+      return false;
+    SDValue Val = Store->getValue();
+    EVT VT = Val.getValueType();
+    unsigned Bits = VT.getFixedSizeInBits();
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    std::optional<unsigned> Offset =
+        getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
+    if (!Offset || (*Offset % 4 != 0))
+      return false;
+
+    SDLoc DL(N);
+    unsigned NumDwords = Bits / 32;
+    SmallVector<SDValue, 4> Dwords;
+    if (NumDwords == 1) {
+      if (VT != MVT::i32)
+        Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
+      Dwords.push_back(Val);
+    } else {
+      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+      SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
+      for (unsigned I = 0; I != NumDwords; ++I)
+        Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                         Vec,
+                                         CurDAG->getConstant(I, DL, MVT::i32)));
+    }
+
+    SmallVector<SDValue, 4> Chains;
+    for (unsigned I = 0; I != NumDwords; ++I) {
+      SDValue Ops[] = {Dwords[I],
+                       CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+                       Store->getChain()};
+      MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
+                                                   DL, MVT::Other, Ops);
+      if (I == 0) // only the first lane carries the original memory operand
+        CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
+      Chains.push_back(SDValue(Lane, 0));
+    }
+    SDValue Chain = NumDwords == 1 ? Chains[0]
+                                   : CurDAG->getNode(ISD::TokenFactor, DL,
+                                                     MVT::Other, Chains);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
+    return true;
+  }
+
+  return false;
+}
 
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
   bool MadeChange = false;
-  while (Position != CurDAG->allnodes_begin()) {
-    SDNode *N = &*--Position;
-    if (N->use_empty())
-      continue;
-
-    switch (N->getOpcode()) {
-    case ISD::BUILD_VECTOR:
-      // TODO: Match load d16 from shl (extload:i16), 16
-      MadeChange |= matchLoadD16FromBuildVector(N);
-      break;
-    default:
-      break;
+
+  // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
+  // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
+  SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
+  while (VGPRPos != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--VGPRPos;
+    MadeChange |= rewriteVGPRFrameAccess(N);
+  }
+
+  if (Subtarget->d16PreservesUnusedBits()) {
+    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+    while (Position != CurDAG->allnodes_begin()) {
+      SDNode *N = &*--Position;
+      if (N->use_empty())
+        continue;
+
+      switch (N->getOpcode()) {
+      case ISD::BUILD_VECTOR:
+        // TODO: Match load d16 from shl (extload:i16), 16
+        MadeChange |= matchLoadD16FromBuildVector(N);
+        break;
+      default:
+        break;
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 95f85a6151375..cf62874912742 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,6 +67,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   bool matchLoadD16FromBuildVector(SDNode *N) const;
+  bool rewriteVGPRFrameAccess(SDNode *N);
   void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index d052f3c73920c..376a1ebcc4256 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
new file mode 100644
index 0000000000000..a3a1cf6f18bed
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -0,0 +1,145 @@
+//===-- AMDGPUPrivateObjectVGPRs.cpp - Lower VGPR-as-memory accesses ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
+/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
+/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
+/// carries a constant byte offset, which selects the dword (subregister) to
+/// copy.
+///
+/// This runs once the function is out of SSA form (so the single backing tuple
+/// can be defined by several subregister copies) and while LiveIntervals is
+/// available. The backing tuple has lane-divergent liveness (its subregisters
+/// are written and read independently), which the whole-register LiveVariables
+/// analysis cannot represent; the pass therefore updates the subregister-aware
+/// LiveIntervals directly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-private-object-vgprs"
+
+namespace {
+
+class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+  /// Replaces SI_VGPR_FRAME_{LOAD,STORE} pseudos with subregister copies.
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Private Object VGPRs";
+  }
+  /// LiveIntervals is required and kept up to date; the CFG is not modified.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addPreserved<LiveIntervalsWrapperPass>();
+    AU.addPreserved<SlotIndexesWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+                "AMDGPU Private Object VGPRs", false, false)
+
+char AMDGPUPrivateObjectVGPRs::ID = 0;
+
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+
+// Expand each SI_VGPR_FRAME_{LOAD,STORE} into a subregister COPY of one tuple.
+bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Collect the pseudos and determine how many dwords the backing tuple needs.
+  SmallVector<MachineInstr *, 8> Worklist;
+  unsigned NumDwords = 0;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opc = MI.getOpcode();
+      if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
+          Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+        continue;
+      unsigned ByteOffset = MI.getOperand(1).getImm();
+      NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+      Worklist.push_back(&MI);
+    }
+  }
+
+  if (Worklist.empty())
+    return false;
+
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+  // VGPR tuple classes exist only for some widths. Pad a dword count that has
+  // none up to the next one that does (32 dwords being the widest tuple); the
+  // padding lanes are never accessed. Wider objects still trip the assert.
+  const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
+  while (!RC && NumDwords < 32)
+    RC = TRI->getVGPRClassForBitWidth(++NumDwords * 32);
+  assert(RC && "no VGPR register class for VGPR-as-memory object");
+  Register Storage = MRI.createVirtualRegister(RC);
+
+  // Define the whole tuple up front so partial (subregister) writes and reads
+  // of uninitialized lanes are well formed.
+  MachineBasicBlock &Entry = MF.front();
+  MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
+                                 TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
+  LIS->InsertMachineInstrInMaps(*ImpDef);
+
+  for (MachineInstr *MI : Worklist) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    const DebugLoc &DL = MI->getDebugLoc();
+    unsigned Dword = MI->getOperand(1).getImm() / 4;
+    unsigned SubReg = NumDwords == 1
+                          ? AMDGPU::NoSubRegister
+                          : SIRegisterInfo::getSubRegFromChannel(Dword);
+    MachineInstr *Copy;
+    if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+      Register Dst = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
+                 .addReg(Storage, {}, SubReg);
+    } else {
+      Register Src = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
+                 .addReg(Storage, RegState::Define, SubReg)
+                 .addReg(Src);
+    }
+    // The copy reuses the pseudo's slot, so operand intervals stay valid.
+    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
+    MI->eraseFromParent();
+  }
+
+  // The backing tuple is brand new; compute its (subregister) live interval.
+  LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
+
+  // Separate dwords (and the entry IMPLICIT_DEF for never-written lanes) are
+  // disconnected value-number components, which one live interval must not
+  // contain; split them into separate vregs, as the register coalescer does.
+  SmallVector<LiveInterval *, 4> SplitLIs;
+  LIS->splitSeparateComponents(LI, SplitLIs);
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2223b9d036fa1..c587302c3bbae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -138,6 +139,7 @@ class AMDGPUPromoteAllocaImpl {
   unsigned MaxVGPRs;
   unsigned VGPRBudgetRatio;
   unsigned MaxVectorRegs;
+  unsigned AllocVGPROffset = 0;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -162,6 +164,10 @@ class AMDGPUPromoteAllocaImpl {
   void analyzePromoteToVector(AllocaAnalysis &AA) const;
   void promoteAllocaToVector(AllocaAnalysis &AA);
   void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+
+  /// Allocate an alloca that already lives in the VGPR address space to a range
+  /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
+  void allocateVgprs(AllocaAnalysis &AA);
   bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                              SetVector<IntrinsicInst *> &DeferredIntrs);
   void
@@ -179,7 +185,11 @@ class AMDGPUPromoteAllocaImpl {
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
   }
 
-  bool run(Function &F, bool PromoteToLDS);
+  /// IsLatePass is true when invoked as a codegen pass and false when invoked
+  /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
+  /// requests only the work strictly required for functionality (i.e. VGPR
+  /// allocation), skipping the optimization-oriented promotions.
+  bool run(Function &F, bool IsLatePass, bool NoOpt);
 };
 
 // FIXME: This can create globals so should be a module pass.
@@ -187,26 +197,34 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 public:
   static char ID;
 
-  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+  explicit AMDGPUPromoteAlloca(
+      CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+      : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
-    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
       return AMDGPUPromoteAllocaImpl(
                  TPC->getTM<TargetMachine>(), *F.getParent(),
                  getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
-          .run(F, /*PromoteToLDS*/ true);
+          .run(F, /*IsLatePass=*/true, NoOpt);
+    }
     return false;
   }
 
-  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+  StringRef getPassName() const override {
+    return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
+  }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  bool NoOpt;
 };
 
 static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -251,7 +269,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
   bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
-                     .run(F, /*PromoteToLDS=*/true);
+                     .run(F, /*IsLatePass=*/true, /*NoOpt=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -264,7 +282,20 @@ PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
   bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
-                     .run(F, /*PromoteToLDS=*/false);
+                     .run(F, /*IsLatePass=*/false, /*NoOpt=*/false);
+  if (Changed) {
+    PreservedAnalyses PA;
+    PA.preserveSet<CFGAnalyses>();
+    return PA;
+  }
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
+                     .run(F, /*IsLatePass=*/true, /*NoOpt=*/true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -273,8 +304,8 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   return PreservedAnalyses::all();
 }
 
-FunctionPass *llvm::createAMDGPUPromoteAlloca() {
-  return new AMDGPUPromoteAlloca();
+FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
+  return new AMDGPUPromoteAlloca(OptLevel);
 }
 
 bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -367,14 +398,121 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
     VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
 }
 
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
-  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
+// A "VGPR as memory" object can only be realized in registers today when every
+// access is an in-bounds, constant-offset, dword-aligned, whole-dword-multiple
+// (32, 64, ... bit) load/store and its address never escapes. Sub-dword
+// accesses, dynamic indexing and escaping addresses need gfx13 support, which
+// is not yet available; such objects fall back to scratch instead.
+//
+// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
+// support so this fallback is no longer needed.
+static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
+                                            const DataLayout &DL) {
+  // An access is lowerable if it is simple, covers a whole number of dwords and
+  // starts at a dword-aligned constant offset that keeps it inside the object.
+  const uint64_t ObjBytes = DL.getTypeSizeInBits(AI.getAllocatedType()) / 8;
+  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
+    if (!Simple)
+      return false;
+    uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
+        DL, Off, /*AllowNonInbounds=*/true);
+    return Base == &AI && Off.urem(4) == 0 && Off.ule(ObjBytes) &&
+           Bits / 8 <= ObjBytes - Off.getZExtValue();
+  };
+
+  SmallVector<const Use *, 16> Worklist;
+  for (const Use &U : AI.uses())
+    Worklist.push_back(&U);
+  while (!Worklist.empty()) {
+    const Use *U = Worklist.pop_back_val();
+    const User *Usr = U->getUser();
+    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      for (const Use &GU : GEP->uses())
+        Worklist.push_back(&GU);
+      continue;
+    }
+    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
+      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
+        return false;
+      continue;
+    }
+    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
+      // The pointer must be the address operand, not a stored value (escape).
+      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return false;
+      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
+                    SI->isSimple()))
+        return false;
+      continue;
+    }
+    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
+    // otherwise not statically lowerable.
+    return false;
+  }
+  return true;
+}
+
+// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
+// \p New (an addrspace(5) value) so a non-lowerable "VGPR as memory" object
+// falls back to scratch. GEPs are rebuilt; lifetime markers are dropped.
+static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
+  SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
+  for (Use *U : Uses) {
+    User *Usr = U->getUser();
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      IRBuilder<> B(GEP);
+      SmallVector<Value *, 4> Indices(GEP->indices());
+      Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
+                                  GEP->getName(), GEP->getNoWrapFlags());
+      rewriteVGPRPointerToScratch(GEP, NewGEP);
+      GEP->eraseFromParent();
+      continue;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(Usr);
+        II && II->isLifetimeStartOrEnd()) {
+      II->eraseFromParent();
+      continue;
+    }
+    // Loads, stores and address-space casts only need this operand repointed.
+    // FIXME(review): users whose type embeds the address space (calls, PHIs,
+    // selects, compares) are repointed too, which looks ill-typed -- confirm.
+    U->set(New);
+  }
+}
+
+static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
+  auto *NewAI = new AllocaInst(
+      AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
+      AI->getAlign(), AI->getName(), AI->getIterator());
+  NewAI->setDebugLoc(AI->getDebugLoc());
+  rewriteVGPRPointerToScratch(AI, NewAI);
+  AI->eraseFromParent();
+}
+
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
+  assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
+  if (!IsLatePass && DisablePromoteAllocaToVector)
     return false;
 
+  bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
+  bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
+
   bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
   MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM, F) : 128;
   setFunctionLimits(F);
 
+  // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
+  // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
+  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+  const bool TargetSupportsVGPRAsMemory =
+      ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
+
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
@@ -391,8 +529,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
 
       AllocaAnalysis AA{AI};
+      if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
+        // Allocas that already live in the VGPR address space only need to be
+        // assigned VGPRs, which is required for functionality.
+        if (IsLatePass)
+          Allocas.push_back(std::move(AA));
+        continue;
+      }
+      if (!PromoteToVector && !PromoteToLDS)
+        continue;
       if (collectAllocaUses(AA)) {
-        analyzePromoteToVector(AA);
+        if (PromoteToVector)
+          analyzePromoteToVector(AA);
         if (PromoteToLDS)
           analyzePromoteToLDS(AA);
         if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -403,8 +551,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     }
   }
 
-  stable_sort(Allocas,
-              [](const auto &A, const auto &B) { return A.Score > B.Score; });
+  stable_sort(Allocas, [](const auto &A, const auto &B) {
+    // Prioritize pre-existing VGPR allocas, since their allocation must not
+    // fail.
+    bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+    bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+    if (AIsVGPR != BIsVGPR)
+      return AIsVGPR;
+    return A.Score > B.Score;
+  });
 
   // clang-format off
   LLVM_DEBUG(
@@ -417,6 +572,39 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   bool Changed = false;
   SetVector<IntrinsicInst *> DeferredIntrs;
   for (AllocaAnalysis &AA : Allocas) {
+    if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
+      // Fall back to scratch (and warn) when the object can't be kept in
+      // registers, so the program still compiles correctly: either the target
+      // does not support "VGPR as memory", or the access pattern (dynamic
+      // index, sub-dword, escaping address) is not yet supported.
+      const char *Unsupported = nullptr;
+      if (!TargetSupportsVGPRAsMemory)
+        Unsupported = "not supported on this target";
+      else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
+        Unsupported = "dynamic indexing, sub-dword access, or escaping address "
+                      "is not yet supported";
+      if (Unsupported) {
+        F.getContext().diagnose(DiagnosticInfoUnsupported(
+            F,
+            Twine("'amdgpu_vgpr' object could not be kept in vector registers "
+                  "(") +
+                Unsupported + "); using scratch memory instead",
+            AA.Alloca->getDebugLoc(), DS_Warning));
+        demoteVGPRAllocaToScratch(AA.Alloca);
+        Changed = true;
+        continue;
+      }
+      const unsigned AllocaCost =
+          AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
+      allocateVgprs(AA);
+      // Account for the consumed VGPRs in the vectorization budget.
+      if (VectorizationBudget > AllocaCost)
+        VectorizationBudget -= AllocaCost;
+      else
+        VectorizationBudget = 0;
+      Changed = true;
+      continue;
+    }
     if (AA.Vector.Ty) {
       std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
       assert(Size); // Expected to succeed on non-array alloca.
@@ -451,6 +639,21 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   return Changed;
 }
 
+void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
+  LLVMContext &Ctx = Mod->getContext();
+  // Use the padded allocation size so adjacent objects can never overlap.
+  const unsigned AllocaSize =
+      DL->getTypeAllocSize(AA.Alloca->getAllocatedType());
+  // Record {byte offset, byte size} of the object within the VGPR file.
+  Type *I32 = Type::getInt32Ty(Ctx);
+  AA.Alloca->setMetadata(
+      "amdgpu.allocated.vgprs",
+      MDNode::get(
+          Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
+                ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
+  AllocVGPROffset += alignTo(AllocaSize, 4);
+}
+
 // Checks if the instruction I is a memset user of the alloca AI that we can
 // deal with. Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 06bfc7e1a5162..7fc233be91fe0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -668,6 +668,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerSGPRSpillsLegacyPass(*PR);
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
+  initializeAMDGPUPrivateObjectVGPRsPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1500,9 +1501,12 @@ void AMDGPUPassConfig::addIRPasses() {
 
   addPass(createAtomicExpandLegacyPass());
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    addPass(createAMDGPUPromoteAlloca());
+  // With optimizations enabled, do the full promotion of allocas. Without them,
+  // only pre-existing VGPR address space allocas are allocated (required for
+  // functionality). FIXME: skipFunction() still skips optnone functions.
+  addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
 
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -1717,6 +1721,11 @@ void GCNPassConfig::addFastRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
+  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+  // At O0 there is no register coalescer; anchor on TwoAddress, where
+  // LiveIntervals is already available.
+  insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
+
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
 
   TargetPassConfig::addFastRegAlloc();
@@ -1743,6 +1752,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
+  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+  // This runs after the coalescer so it does not perturb the kill flags that
+  // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
+  // the LiveIntervals the register allocator consumes next.
+  insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
+
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
@@ -2283,8 +2298,15 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
 
   addFunctionPass(AtomicExpandPass(TM), PMW);
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+  // With optimizations enabled, do the full promotion of allocas. Without
+  // optimizations, only allocate pre-existing VGPR address space allocas, which
+  // is required for functionality.
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
     addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
+  else
+    addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(PMW);
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 46edc44e2cc05..dd25ab71997d7 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -100,6 +100,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPreloadKernArgProlog.cpp
   AMDGPUPreloadKernelArguments.cpp
   AMDGPUPrintfRuntimeBinding.cpp
+  AMDGPUPrivateObjectVGPRs.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPUPromoteKernelArguments.cpp
   AMDGPURegBankCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 750cb1973e21f..3594caef86782 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,6 +1243,25 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
 }
 } // End Spill = 1, VALU = 1, isConvergent = 1
 
+// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
+// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
+// within the per-function VGPR file. They are produced during instruction
+// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
+// pass before register allocation.
+let hasSideEffects = 0 in {
+def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
+                                        (ins i32imm:$offset)> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
+                                         (ins VGPR_32:$vdata, i32imm:$offset)> {
+  let mayLoad = 0;
+  let mayStore = 1;
+}
+} // End hasSideEffects = 0
+
 // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
 // needs to be used and an extra instruction to move between VGPR and AGPR.
 // UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96571dd028b14..7528cd2a009a3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1779,6 +1780,17 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
   return false;
 }
 
+AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
+  const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
+  assert(MD && MD->getNumOperands() == 2 &&
+         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
+  unsigned Address =
+      mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
+  unsigned Size =
+      mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+  return {Address, Size};
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2c61abf946f99..923c5c3a988fd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -31,6 +31,7 @@ struct amd_kernel_code_t;
 namespace llvm {
 
 struct Align;
+class AllocaInst;
 class Argument;
 class Function;
 class GlobalValue;
@@ -1037,6 +1038,16 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
 /// Checks if \p Val is inside \p MD, a !range-like metadata.
 bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
 
+/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
+/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
+/// to within the VGPR file, and its size in bytes.
+struct AllocatedVGPRsMetadata {
+  unsigned Address;
+  unsigned Size;
+
+  static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
+};
+
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
new file mode 100644
index 0000000000000..f6c64c5121867
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
+
+define void @vgpr_alloca() {
+; CHECK-LABEL: define void @vgpr_alloca(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  ret void
+}
+
+define void @vgpr_alloca_multiple() {
+; CHECK-LABEL: define void @vgpr_alloca_multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
+; CHECK-NEXT:    [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[B]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4, addrspace(13)
+  %b = alloca [2 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  store i32 0, ptr addrspace(13) %b
+  ret void
+}
+
+define void @private_alloca_unchanged() {
+; CHECK-LABEL: define void @private_alloca_unchanged(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    store i64 42, ptr addrspace(5) [[A]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i64], align 4, addrspace(5)
+  store i64 42, ptr addrspace(5) %a
+  ret void
+}
+
+declare void @use(ptr)
+
+; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
+; back to ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
+; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(5) [[P2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+; A VGPR object whose address escapes (here via a cast to a generic pointer, as
+; the frontend emits) cannot be kept in registers yet, so it falls back to
+; ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_escaping() {
+; CHECK-LABEL: define void @vgpr_alloca_escaping(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
+; CHECK-NEXT:    call void @use(ptr [[CAST]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %cast = addrspacecast ptr addrspace(13) %a to ptr
+  call void @use(ptr %cast)
+  ret void
+}
+
+; Whole-dword-multiple accesses (here i64) stay in VGPRs.
+define void @vgpr_alloca_i64(i64 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_i64(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
+; CHECK-NEXT:    store i64 [[V]], ptr addrspace(13) [[A]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i64, align 8, addrspace(13)
+  store i64 %v, ptr addrspace(13) %a
+  ret void
+}
+
+; Sub-dword accesses are not supported yet, so the object falls back to scratch.
+define void @vgpr_alloca_subdword(i16 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_subdword(
+; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
+; CHECK-NEXT:    store i16 [[V]], ptr addrspace(5) [[A1]], align 2
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [2 x i16], align 4, addrspace(13)
+  store i16 %v, ptr addrspace(13) %a
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0, i32 16}
+; CHECK: [[META1]] = !{i32 0, i32 4}
+; CHECK: [[META2]] = !{i32 4, i32 8}
+; CHECK: [[META3]] = !{i32 0, i32 8}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
new file mode 100644
index 0000000000000..63ba44b479279
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -0,0 +1,20 @@
+; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
+; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
+; scratch.
+
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+
+define void @vgpr_obj() {
+; SUPP:   alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
+; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
new file mode 100644
index 0000000000000..ea914907a900d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
@@ -0,0 +1,58 @@
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
+; indices must lower to register copies, never to scratch/buffer memory traffic.
+
+; CHECK-LABEL: store_load_i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_i32(i32 %v) {
+  %a = alloca i32, align 4, addrspace(13)
+  store i32 %v, ptr addrspace(13) %a
+  %l = load i32, ptr addrspace(13) %a
+  %r = add i32 %l, 1
+  ret i32 %r
+}
+
+; CHECK-LABEL: store_load_array:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_array(i32 %v) {
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
+  %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
+  store i32 %v, ptr addrspace(13) %p1
+  store i32 7, ptr addrspace(13) %p3
+  %l1 = load i32, ptr addrspace(13) %p1
+  %l3 = load i32, ptr addrspace(13) %p3
+  %s = add i32 %l1, %l3
+  ret i32 %s
+}
+
+; A 64-bit (two-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_i64:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i64 @store_load_i64(i64 %v) {
+  %a = alloca i64, align 8, addrspace(13)
+  store i64 %v, ptr addrspace(13) %a
+  %l = load i64, ptr addrspace(13) %a
+  %r = add i64 %l, 1
+  ret i64 %r
+}
+
+; A vector (four-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_v4i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
+  %a = alloca <4 x i32>, align 16, addrspace(13)
+  store <4 x i32> %v, ptr addrspace(13) %a
+  %l = load <4 x i32>, ptr addrspace(13) %a
+  ret <4 x i32> %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 0c591ec5b4669..0dbabd2991bc4 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -29,6 +29,7 @@
 ; GCN-O0-NEXT: amdgpu-lower-module-lds
 ; GCN-O0-NEXT: function
 ; GCN-O0-NEXT:   atomic-expand
+; GCN-O0-NEXT:   amdgpu-vgpr-allocate
 ; GCN-O0-NEXT:   verify
 ; GCN-O0-NEXT:   unreachableblockelim
 ; GCN-O0-NEXT:   ee-instrument<post-inline>
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 070c873798647..aabfadd33e976 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,11 +49,13 @@
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand Atomic instructions
+; GCN-O0-NEXT:      Dominator Tree Construction
+; GCN-O0-NEXT:      Natural Loop Information
+; GCN-O0-NEXT:      AMDGPU VGPR Allocate
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O0-NEXT:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; GCN-O0-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT:      Expand reduction intrinsics
-; GCN-O0-NEXT:      Dominator Tree Construction
 ; GCN-O0-NEXT:      AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT:    Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT:    AMDGPU lower intrinsics
@@ -115,6 +117,7 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Slot index numbering
 ; GCN-O0-NEXT:        Live Interval Analysis
+; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Whole Quad Mode
 ; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
@@ -359,6 +362,7 @@
 ; GCN-O1-NEXT:        Live Interval Analysis
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Register Coalescer
+; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
@@ -676,6 +680,7 @@
 ; GCN-O1-OPTS-NEXT:        Live Interval Analysis
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Register Coalescer
+; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-OPTS-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
@@ -998,6 +1003,7 @@
 ; GCN-O2-NEXT:        Live Interval Analysis
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Register Coalescer
+; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O2-NEXT:        Rewrite Partial Register Uses
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
@@ -1334,6 +1340,7 @@
 ; GCN-O3-NEXT:        Live Interval Analysis
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Register Coalescer
+; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O3-NEXT:        Rewrite Partial Register Uses
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index f31d6228d7936..bd760de79c9d0 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,23 +2,23 @@
 
 target triple = "amdgcn-amd-amdhsa"
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
 define void @static_alloca() {
 entry:
@@ -32,26 +32,27 @@ entry:
   %alloca.7 = alloca i32, align 4, addrspace(7)
   %alloca.8 = alloca i32, align 4, addrspace(8)
   %alloca.9 = alloca i32, align 4, addrspace(9)
+  %alloca.13 = alloca i32, align 4, addrspace(13)
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i32(i32 %n) {
 entry:
@@ -68,23 +69,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i64(i64 %n) {
 entry:

>From 6d47f41fc10ba761312d9e98e15e51ed78c7e997 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Wed, 24 Jun 2026 19:32:40 -0500
Subject: [PATCH 2/2] Lower VGPR-as-memory accesses via REG_LOAD/REG_STORE
 nodes

---
 clang/include/clang/Basic/AttrDocs.td         |  26 +-
 .../clang/Basic/DiagnosticCommonKinds.td      |   5 -
 .../clang/Basic/DiagnosticSemaKinds.td        |  12 +-
 clang/include/clang/Sema/SemaAMDGPU.h         |   4 +
 clang/lib/CodeGen/CGDecl.cpp                  |  43 +-
 clang/lib/Sema/SemaAMDGPU.cpp                 |  38 +-
 clang/lib/Sema/SemaDecl.cpp                   |   2 +
 clang/test/CodeGen/target-data.c              |   4 +-
 .../CodeGenHIP/amdgpu-vgpr-O0-warning.hip     |  14 -
 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip      |  19 +
 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip    |  25 +-
 clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl |   2 +-
 clang/test/SemaCUDA/amdgpu-vgpr.cu            |  26 +-
 llvm/docs/AMDGPUUsage.rst                     |  37 +-
 llvm/include/llvm/Support/AMDGPUAddrSpace.h   |  13 +-
 llvm/lib/IR/AutoUpgrade.cpp                   |   5 +
 llvm/lib/IR/Verifier.cpp                      |   7 +
 llvm/lib/IR/VerifierAMDGPU.cpp                |  51 ++-
 llvm/lib/IR/VerifierInternal.h                |   4 +
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  17 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   1 -
 .../Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp  | 315 +++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   3 +-
 .../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp       | 269 +++++++----
 .../Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h  |  23 +
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 231 +---------
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    |  14 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  48 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 433 +++++++++++++++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   3 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  11 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  69 ++-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   6 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  10 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  46 ++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   8 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 -
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  19 +-
 llvm/lib/TargetParser/TargetDataLayout.cpp    |   4 +-
 .../AMDGPU/amdgpu-vgpr-allocate-basic.ll      | 109 -----
 .../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll     |  20 -
 .../CodeGen/AMDGPU/as-vgpr-alloca-static.ll   |  58 ---
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   7 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  19 +-
 llvm/test/CodeGen/AMDGPU/nullptr.ll           |   2 +-
 .../CodeGen/AMDGPU/sgpr-regalloc-flags.ll     |   1 +
 .../AMDGPU/vgpr-as-memory-addrspacecast.ll    |  49 ++
 .../AMDGPU/vgpr-as-memory-callgraph.ll        |  50 ++
 .../AMDGPU/vgpr-as-memory-constexpr.ll        |  44 ++
 .../CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll  | 346 ++++++++++++++
 .../AMDGPU/vgpr-as-memory-error-const-oob.ll  |  15 +
 .../vgpr-as-memory-error-dynamic-toolarge.ll  |  15 +
 .../vgpr-as-memory-error-extern-call.ll       |  19 +
 .../vgpr-as-memory-error-indirect-call.ll     |  15 +
 .../vgpr-as-memory-error-inlineasm-clobber.ll |  15 +
 .../vgpr-as-memory-error-ungrouped-call.ll    |  19 +
 .../vgpr-as-memory-error-unsupported-more.ll  |  32 ++
 .../vgpr-as-memory-error-unsupported.ll       |  15 +
 .../AMDGPU/vgpr-as-memory-function-ref.ll     |  18 +
 .../AMDGPU/vgpr-as-memory-gisel-fallback.ll   |  28 ++
 .../AMDGPU/vgpr-as-memory-lower-module.ll     |  80 ++++
 .../CodeGen/AMDGPU/vgpr-as-memory-subdword.ll |  63 +++
 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll    |  75 +++
 llvm/test/Verifier/AMDGPU/alloca.ll           |  56 +--
 llvm/test/Verifier/AMDGPU/vgpr-memory.ll      |  49 ++
 .../Bitcode/DataLayoutUpgradeTest.cpp         |  38 +-
 68 files changed, 2446 insertions(+), 859 deletions(-)
 delete mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
 delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
 create mode 100644 llvm/test/Verifier/AMDGPU/vgpr-memory.ll

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 7dcf35fe3bd83..f8720cd67efad 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3607,20 +3607,22 @@ An error will be given if:
 def AMDGPUVGPRDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-This attribute requests that a kernel-local variable be allocated in the
-"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
-so that accesses with statically known indices lower to vector register
-copies instead of scratch memory traffic.
+This attribute requests that a device-side local variable be placed in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target, so that
+its accesses lower to vector register copies (constant index) or hardware
+register-indexing sequences (dynamic index) instead of scratch memory traffic.
+
+Such a variable is backed by a fixed block of vector registers rather than the
+stack, so - like an LDS/``__shared__`` variable - it is emitted as an internal
+global in ``addrspace(13)`` with a ``poison`` initializer; its contents are
+undefined until written. This is honored at every optimization level, including
+``-O0``.
 
 Clang supports the ``__attribute__((amdgpu_vgpr))`` or
-``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
-local variables declared in a ``__global__`` (kernel) function; applying it to
-a variable in a ``__device__`` or host function, or outside HIP/CUDA, is an
-error.
-
-Known limitation: the request is only honored with optimizations enabled. At
-``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
-emitted.
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. Like ``__shared__``, it may be
+applied to a local variable in any device-side function (a ``__global__``
+kernel or a ``__device__`` function); applying it to a variable in host code is
+an error.
   }];
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index fe03be43c80c7..f2ed2f4698b8d 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,11 +319,6 @@ def warn_stack_protection_ignore_attribute : Warning<
   "'stack_protector_ignore' attribute ignored due to "
   "'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
 
-def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
-  "%0 is not guaranteed to keep the variable in vector registers at -O0; "
-  "it may fall back to scratch memory">,
-  InGroup<DiagGroup<"amdgpu-vgpr">>;
-
 def warn_slh_does_not_support_asm_goto : Warning<
   "speculative load hardening does not protect functions with asm goto">,
   InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a5e56e94509da..74b933ec743bb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3711,9 +3711,15 @@ def err_attribute_argument_invalid : Error<
 def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
   "'amdgpu_flat_work_group_size' attribute must match "
   "'reqd_work_group_size' product">;
-def err_amdgpu_vgpr_not_kernel_local : Error<
-  "%0 attribute can only be applied to local variables in "
-  "'__global__' (kernel) functions">;
+def err_amdgpu_vgpr_host : Error<
+  "'amdgpu_vgpr' variables are not allowed in "
+  "%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
+def err_amdgpu_vgpr_initializer : Error<
+  "a variable with the 'amdgpu_vgpr' attribute cannot have an initializer; it "
+  "is backed by registers with undefined initial contents">;
+def err_amdgpu_vgpr_bad_storage : Error<
+  "the 'amdgpu_vgpr' attribute requires an automatic, fixed-size local "
+  "variable">;
 def err_attribute_argument_is_zero : Error<
   "%0 attribute must be greater than 0">;
 def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index 9cb74ed74f4b9..1d604e547ff70 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -81,6 +81,10 @@ class SemaAMDGPU : public SemaBase {
   void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
 
+  /// Diagnose 'amdgpu_vgpr' variable constraints that need the complete
+  /// declaration: no variably modified type and no initializer.
+  void checkAMDGPUVGPRVarDecl(VarDecl *VD);
+
   /// Expand a valid use of the feature identification builtins into its
   /// corresponding sequence of instructions.
   Expr *ExpandAMDGPUPredicateBuiltIn(Expr *CE);
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index bca2d11d47c6a..570845f64e126 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1603,30 +1603,23 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // building the instruction so that it's there even in no-asserts
       // builds.
       //
-      // "VGPR as memory" objects keep their backing registers only once the
-      // optimizing register allocator runs. At -O0 the backend cannot lower
-      // these accesses (e.g. when the address escapes a basic block), so the
-      // request is not honored: fall back to an ordinary (scratch) alloca and
-      // warn, matching the documented behavior.
-      // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
-      // backing tuple to scratch) so this fallback can be removed.
-      const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+      // A "VGPR as memory" object (amdgpu_vgpr) is register-backed, not on the
+      // stack, so - like LDS/__shared__ - it is emitted as an internal global
+      // in AMDGPUAS::VGPR with a poison initializer (the registers have no
+      // defined initial value). This applies only to device compilation; on the
+      // host (e.g. a __host__ __device__ function compiled for the host) it
+      // falls back to an ordinary stack alloca.
       const bool UseVGPRMemory =
-          VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
-      if (VGPRAttr && !UseVGPRMemory)
-        CGM.getDiags().Report(D.getLocation(),
-                              diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
-            << VGPRAttr;
-
+          D.hasAttr<AMDGPUVGPRAttr>() && getLangOpts().CUDAIsDevice;
       if (UseVGPRMemory) {
-        // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
-        // address space so that statically indexed accesses lower to vector
-        // register copies instead of scratch memory.
-        auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
-                                        /*ArraySize=*/nullptr, D.getName(),
-                                        AllocaInsertPt->getIterator());
-        AI->setAlignment(allocaAlignment.getAsAlign());
-        AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+        auto *GV = new llvm::GlobalVariable(
+            CGM.getModule(), allocaTy, /*isConstant=*/false,
+            llvm::GlobalValue::InternalLinkage,
+            llvm::PoisonValue::get(allocaTy), getStaticDeclName(CGM, D),
+            /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+            llvm::AMDGPUAS::VGPR);
+        GV->setAlignment(allocaAlignment.getAsAlign());
+        AllocaAddr = RawAddress(GV, allocaTy, allocaAlignment, KnownNonNull);
         address = AllocaAddr;
       } else {
         address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
@@ -1641,9 +1634,9 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
           D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
 
       // Emit a lifetime intrinsic if meaningful. There's no point in doing this
-      // if we don't have a valid insertion point (?). "VGPR as memory" allocas
-      // live in a non-alloca address space, so the standard lifetime markers
-      // (which assume the alloca address space) are skipped for them.
+      // if we don't have a valid insertion point (?). "VGPR as memory" objects
+      // are globals, not allocas, so the stack-slot lifetime markers are
+      // skipped.
       if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
         // If there's a jump into the lifetime of this variable, its lifetime
         // gets broken up into several regions in IR, which requires more work
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 0568ab0b60a07..055bc373a623b 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -22,6 +22,7 @@
 #include "clang/Sema/Ownership.h"
 #include "clang/Sema/Scope.h"
 #include "clang/Sema/Sema.h"
+#include "clang/Sema/SemaCUDA.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
@@ -628,18 +629,41 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
 }
 
 void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
-  // The LocalVar subject list already guarantees this is a local variable.
-  // Restrict it further to locals declared directly in a __global__ kernel;
-  // it is meaningless (and an error) in __device__ or host functions.
-  const auto *FD = dyn_cast<FunctionDecl>(D->getDeclContext());
-  if (!FD || !FD->hasAttr<CUDAGlobalAttr>()) {
-    Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+  // Like __shared__/LDS, this is device-side register storage, so it is allowed
+  // in any device-side function (kernel or __device__) and rejected only in
+  // host code. There is no kernel-only restriction: the backend handles direct
+  // references to the resulting addrspace(13) global from any function (e.g.
+  // ones IPO may introduce), independently of where the attribute was written.
+  if (SemaRef.getLangOpts().CUDA &&
+      SemaRef.CUDA().DiagIfHostCode(AL.getLoc(), diag::err_amdgpu_vgpr_host)
+          << SemaRef.CUDA().CurrentTarget())
     return;
-  }
 
   D->addAttr(::new (getASTContext()) AMDGPUVGPRAttr(getASTContext(), AL));
 }
 
+void SemaAMDGPU::checkAMDGPUVGPRVarDecl(VarDecl *VD) {
+  if (!VD->hasAttr<AMDGPUVGPRAttr>() || VD->isInvalidDecl())
+    return;
+
+  // Only a fixed-size local is register-backed at codegen (the attribute's
+  // LocalVar subject already excludes static-storage locals). A VLA would
+  // silently ignore the attribute, so reject any variably modified type.
+  if (VD->getType()->isVariablyModifiedType()) {
+    Diag(VD->getLocation(), diag::err_amdgpu_vgpr_bad_storage);
+    VD->setInvalidDecl();
+    return;
+  }
+
+  // "VGPR as memory" objects are backed by registers with no defined initial
+  // contents (like __shared__), so they cannot be initialized.
+  if (VD->hasInit()) {
+    Diag(VD->getLocation(), diag::err_amdgpu_vgpr_initializer)
+        << VD->getInit()->getSourceRange();
+    VD->setInvalidDecl();
+  }
+}
+
 static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
                                            Expr *MaxExpr,
                                            const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index d45c3eb35094f..40cf988add198 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -15342,6 +15342,8 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) {
   if (getLangOpts().CUDA)
     CUDA().checkAllowedInitializer(VD);
 
+  AMDGPU().checkAMDGPUVGPRVarDecl(VD);
+
   // Grab the dllimport or dllexport attribute off of the VarDecl.
   const InheritableAttr *DLLAttr = getDLLAttr(VD);
 
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index a5e0b814c7042..f03aaba8b53dd 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
 
 // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // Test default -target-cpu
 // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
deleted file mode 100644
index 4d23008b8ef43..0000000000000
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
-// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
-//
-// At -O0 "VGPR as memory" is not honored: the variable falls back to an
-// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
-
-#define __global__ __attribute__((global))
-
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
-__global__ void kernel(int *out, int i) {
-  int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
-  buf[2] = i;
-  out[0] = buf[2];
-}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
new file mode 100644
index 0000000000000..b8618433055cb
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// "VGPR as memory" is honored at every optimization level (it is a global, not
+// an alloca that depends on the optimizing register allocator), so at -O0 the
+// variable is still emitted in addrspace(13) with no diagnostic.
+
+// expected-no-diagnostics
+
+#define __global__ __attribute__((global))
+
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr));
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
index 9a5c38e48951c..712a2121d19a8 100644
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -3,17 +3,30 @@
 // RUN:   | FileCheck %s
 
 #define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
 
-// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
-// memory" address space (addrspace(13)), and its accesses stay in that space.
+// A variable marked amdgpu_vgpr is emitted as an internal global in the "VGPR
+// as memory" address space (addrspace(13)) with a poison initializer (like an
+// LDS/__shared__ variable), and its accesses stay in that space. It is allowed
+// in a __device__ function too, not just a __global__ kernel.
+
+// CHECK-DAG: @{{.*}}kernel{{.*}} = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-DAG: @{{.*}}device{{.*}} = internal addrspace(13) global [4 x i32] poison, align 4
+
+// CHECK-LABEL: define {{.*}}@_Z6devicePii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}device
+__device__ void device(int *out, int i) {
+  int dbuf[4] __attribute__((amdgpu_vgpr));
+  dbuf[1] = i;
+  out[0] = dbuf[1];
+}
 
 // CHECK-LABEL: define {{.*}}@_Z6kernelPii(
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
-// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
-// CHECK: store i32 %{{.*}}, ptr addrspace(13)
-// CHECK: load i32, ptr addrspace(13)
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}kernel
+// CHECK: load i32, ptr addrspace(13) {{.*}}@{{.*}}kernel
 __global__ void kernel(int *out, int i) {
   int buf[4] __attribute__((amdgpu_vgpr));
   buf[2] = i;
   out[0] = buf[2];
+  device(out, i);
 }
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..f120db1aaf6cd 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
 
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 void foo(void) {}
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
index 6ad3074921b9b..05f2d64d56344 100644
--- a/clang/test/SemaCUDA/amdgpu-vgpr.cu
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -8,13 +8,24 @@ __global__ void kernel() {
   (void)ok;
 }
 
-__device__ void device_fn() {
-  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+__global__ void initialized() {
+  // Register-backed storage has undefined initial contents, so (like
+  // __shared__) it cannot be initialized.
+  int bad __attribute__((amdgpu_vgpr)) = 7; // expected-error {{a variable with the 'amdgpu_vgpr' attribute cannot have an initializer}}
+  int arr[2] __attribute__((amdgpu_vgpr)) = {1, 2}; // expected-error {{a variable with the 'amdgpu_vgpr' attribute cannot have an initializer}}
   (void)bad;
+  (void)arr;
+}
+
+__device__ void device_fn() {
+  // Allowed in device functions too (like __shared__); the backend handles
+  // references to the global from non-kernel functions.
+  int ok __attribute__((amdgpu_vgpr)); // OK
+  (void)ok;
 }
 
 __host__ void host_fn() {
-  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' variables are not allowed in __host__ functions}}
   (void)bad;
 }
 
@@ -26,3 +37,12 @@ __global__ void takes_no_args() {
   int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
   (void)bad;
 }
+
+__global__ void bad_storage(int n) {
+  // A static-storage local is not a LocalVar subject; a VLA is rejected as not
+  // fixed-size. Both must avoid silently ignoring the attribute.
+  static int s __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+  int vla[n] __attribute__((amdgpu_vgpr));   // expected-error {{the 'amdgpu_vgpr' attribute requires an automatic, fixed-size local variable}}
+  (void)s;
+  (void)vla;
+}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 2b522395ee892..b5b56fe2a1310 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -994,7 +994,7 @@ supported for the ``amdgcn`` target.
      *reserved for future use*             10
      *reserved for future use*             11
      *reserved for downstream use (LLPC)*  12
-     *reserved for future use*             13
+     VGPR as memory                        13              N/A         VGPR             32      0xFFFFFFFF
      *reserved for future use*             14
      *reserved for future use*             16
      Streamout Registers                   128             N/A         GS_REGS
@@ -1104,6 +1104,41 @@ supported for the ``amdgcn`` target.
   When using code object V5 ``LIBOMPTARGET_STACK_SIZE`` may be used to provide the
   private segment size in bytes, for cases where a dynamic stack is used.
 
+**VGPR as memory**
+  The "VGPR as memory" address space holds small objects directly in vector
+  registers instead of scratch (private) memory, avoiding memory traffic for
+  frequently accessed kernel-local data. Objects in this address space are
+  represented as global variables (similar to how *Local* memory uses LDS
+  global variables) and are backed by a block of physical VGPRs that is
+  reserved out of the register allocator for the duration of the function.
+
+  An address in this space is a register-relative byte offset into the reserved
+  VGPR block, not an address into an addressable memory segment. A load or
+  store at a constant index lowers to a register copy to/from a fixed VGPR; a
+  load or store at a variable (dynamic) index lowers to a hardware register
+  indexing sequence. Sub-dword (8/16-bit) accesses are implemented as
+  read-modify-write of the containing dword.
+
+  An ``addrspacecast`` to or from this address space has no meaningful
+  translation (there is no real address to convert): it is permitted but lowers
+  to ``poison``. ``ptrtoint``/``inttoptr`` are also permitted, but they are
+  *not* poison - the integer is the register-relative byte offset, so an access
+  through an ``inttoptr`` value is lowered as a dynamic (runtime-indexed)
+  access and clamped into the reserved block like any other dynamic index. The
+  verifier still rejects a non-``undef``/``poison`` initializer on such a global
+  variable, atomic accesses, and any intrinsic call taking a pointer into this
+  address space (e.g. ``llvm.memcpy``/``memset``/``memmove``), none of which can
+  be modelled by register storage. The numeric value 13 coincides with the
+  graphics-only ``CONSTANT_BUFFER_5`` alias, which never co-exists with this
+  feature.
+
+  The backing registers have no defined initial contents: reading an object
+  before it is written is undefined behavior and may observe values left in
+  those physical VGPRs by a previously executed wave, just like reading
+  uninitialized scratch or LDS. The reserved block is per-lane and private to
+  the function's call graph; an out-of-range dynamic index is clamped into the
+  block so it cannot read or modify unrelated registers.
+
 **Constant 32-bit**
   *TODO*
 
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index e9d3add54d054..5c536883c6636 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -96,17 +96,25 @@ namespace AMDGPU {
 enum class FlatAddrSpace : unsigned { FLAT, FlatGlobal, FlatScratch };
 
 inline bool isFlatGlobalAddrSpace(unsigned AS) {
+  // AMDGPUAS::VGPR is register-backed, not flat-addressable (see its enum
+  // note).
   return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+         AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
 }
 
 inline bool isExtendedGlobalAddrSpace(unsigned AS) {
+  // AMDGPUAS::VGPR is register-backed, not global (see its enum note).
   return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
          AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
-         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+         (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
 }
 
 inline bool isConstantAddressSpace(unsigned AS) {
+  // AMDGPUAS::VGPR is register-backed read/write storage, not constant memory,
+  // despite aliasing CONSTANT_BUFFER_5 (see its enum note).
+  if (AS == AMDGPUAS::VGPR)
+    return false;
   switch (AS) {
     using namespace AMDGPUAS;
   case CONSTANT_ADDRESS:
@@ -185,6 +193,7 @@ constexpr int64_t getNullPointerValue(unsigned AS) {
   case PRIVATE_ADDRESS:
   case LOCAL_ADDRESS:
   case REGION_ADDRESS:
+  case VGPR:
     return -1;
   default:
     return 0;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3a823f906b012..c753e9e2bf56a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6851,6 +6851,11 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
         Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
       if (!DL.contains("-p9") && !DL.starts_with("p9"))
         Res.append("-p9:192:256:256:32");
+
+      // Add sizing for address space 13 ("VGPR as memory"), 32-bit
+      // register-relative indices.
+      if (!DL.contains("-p13") && !DL.starts_with("p13"))
+        Res.append("-p13:32:32");
     }
 
     // Upgrade the ELF mangling mode.
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index a4e0f531ab1ef..64046f8e53a20 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -807,6 +807,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
         "Global variable is too large to fit into the address space", &GV,
         GVType);
 
+  // Target-specific global variable checks.
+  verifyAMDGPUGlobalVariable(*this, GV);
+
   if (!GV.hasInitializer()) {
     visitGlobalValue(GV);
     return;
@@ -4568,6 +4571,7 @@ void Verifier::visitLoadInst(LoadInst &LI) {
           ElTy, &LI);
 
     checkAtomicMemAccessSize(ElTy, &LI);
+    verifyAMDGPUAtomicAccess(*this, LI.getPointerAddressSpace(), &LI);
   } else {
     Check(LI.getSyncScopeID() == SyncScope::System,
           "Non-atomic load cannot have SynchronizationScope specified", &LI);
@@ -4596,6 +4600,7 @@ void Verifier::visitStoreInst(StoreInst &SI) {
           "point, or vector type!",
           ElTy, &SI);
     checkAtomicMemAccessSize(ElTy, &SI);
+    verifyAMDGPUAtomicAccess(*this, SI.getPointerAddressSpace(), &SI);
   } else {
     Check(SI.getSyncScopeID() == SyncScope::System,
           "Non-atomic store cannot have SynchronizationScope specified", &SI);
@@ -4674,6 +4679,7 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
   Check(ElTy->isIntOrPtrTy(),
         "cmpxchg operand must have integer or pointer type", ElTy, &CXI);
   checkAtomicMemAccessSize(ElTy, &CXI);
+  verifyAMDGPUAtomicAccess(*this, CXI.getPointerAddressSpace(), &CXI);
   visitInstruction(CXI);
 }
 
@@ -4713,6 +4719,7 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
   checkAtomicMemAccessSize(ElTy, &RMWI);
   Check(AtomicRMWInst::FIRST_BINOP <= Op && Op <= AtomicRMWInst::LAST_BINOP,
         "Invalid binary operation!", &RMWI);
+  verifyAMDGPUAtomicAccess(*this, RMWI.getPointerAddressSpace(), &RMWI);
   visitInstruction(RMWI);
 }
 
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index de9a0c7bef132..bbf52af565b20 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
@@ -122,10 +123,37 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
   if (!VS.TT.isAMDGPU())
     return;
 
-  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
-      AI.getAddressSpace() != AMDGPUAS::VGPR)
-    VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
-                   &AI);
+  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+}
+
+void llvm::verifyAMDGPUGlobalVariable(VerifierSupport &VS,
+                                      const GlobalVariable &GV) {
+  if (!VS.TT.isAMDGPU())
+    return;
+
+  if (GV.getAddressSpace() != AMDGPUAS::VGPR)
+    return;
+
+  // "VGPR as memory" objects are backed by registers, which have no defined
+  // initial contents, so (like LDS) they cannot be statically initialized: the
+  // only permitted initializer is an undef/poison placeholder (isa<UndefValue>
+  // also matches poison).
+  Check(!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer()),
+        "global variable in the VGPR address space (13) cannot have an "
+        "initializer",
+        &GV);
+}
+
+void llvm::verifyAMDGPUAtomicAccess(VerifierSupport &VS, unsigned AS,
+                                    const Value *V) {
+  if (!VS.TT.isAMDGPU())
+    return;
+
+  // "VGPR as memory" is per-lane register storage, so an atomic access to it is
+  // meaningless and unsupported.
+  Check(AS != AMDGPUAS::VGPR,
+        "atomic operations on the VGPR address space (13) are not allowed", V);
 }
 
 bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
@@ -139,6 +167,21 @@ bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
 
 void llvm::verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
                                      CallBase &Call) {
+  // No intrinsic models "VGPR as memory" (only plain load/store is supported),
+  // so an addrspace(13) pointer argument - to a memory intrinsic, masked
+  // load/store, gather/scatter, ptrmask, etc. - would be mishandled.
+  if (VS.TT.isAMDGPU())
+    for (const Value *Op : Call.args()) {
+      Type *T = Op->getType();
+      if (T->isPtrOrPtrVectorTy() &&
+          T->getPointerAddressSpace() == AMDGPUAS::VGPR) {
+        VS.CheckFailed("intrinsic with a VGPR address space (13) pointer "
+                       "argument is not allowed",
+                       &Call);
+        break;
+      }
+    }
+
   switch (ID) {
   default:
     return;
diff --git a/llvm/lib/IR/VerifierInternal.h b/llvm/lib/IR/VerifierInternal.h
index 70d1521475198..94ac405f10885 100644
--- a/llvm/lib/IR/VerifierInternal.h
+++ b/llvm/lib/IR/VerifierInternal.h
@@ -221,6 +221,10 @@ void verifyAMDGPUFunctionMetadata(VerifierSupport &VS, const Function &F);
 
 void verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI);
 
+void verifyAMDGPUGlobalVariable(VerifierSupport &VS, const GlobalVariable &GV);
+
+void verifyAMDGPUAtomicAccess(VerifierSupport &VS, unsigned AS, const Value *V);
+
 void verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
                                CallBase &Call);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index d19333f14ee63..2f3ae2ad7ecbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernelArgumentsLegacyID;
 
 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
+FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
@@ -276,18 +276,15 @@ struct AMDGPUPromoteAllocaPass
   TargetMachine &TM;
 };
 
-void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+void initializeAMDGPUPrivateObjectVGPRsLegacyPass(PassRegistry &);
 extern char &AMDGPUPrivateObjectVGPRsID;
 
-// Allocates pre-existing VGPR address space allocas without performing any
-// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
-// objects remain functional.
-struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
-  AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ModulePass *createAMDGPULowerModuleVGPRsPass();
+void initializeAMDGPULowerModuleVGPRsPass(PassRegistry &);
+extern char &AMDGPULowerModuleVGPRsID;
 
-private:
-  TargetMachine &TM;
+struct AMDGPULowerModuleVGPRsPass : PassInfoMixin<AMDGPULowerModuleVGPRsPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
 struct AMDGPUPromoteAllocaToVectorPass
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e289058a2ed1..7330f3b13f3cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,10 +21,8 @@
 #include "R600RegisterInfo.h"
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -343,159 +341,25 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
-// Resolve the constant byte offset within the per-function VGPR file for a
-// "VGPR as memory" access whose (legalized) address is \p Ptr. Returns
-// std::nullopt if \p Ptr is not a constant offset from a VGPR-as-memory frame
-// object.
-static std::optional<unsigned>
-getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
-  unsigned ExtraOffset = 0;
-  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
-    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
-      ExtraOffset = C->getZExtValue();
-      Ptr = Ptr.getOperand(0);
-    }
-  }
-  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
-  if (!FI)
-    return std::nullopt;
-  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
-  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
-    return std::nullopt;
-  return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
-}
-
-// Lower a load/store of a "VGPR as memory" object into one
-// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
-// offset. The pseudos are later expanded into subregister copies by
-// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
-// split into their dword lanes; sub-dword and non-dword-multiple accesses are
-// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
-// true if \p N was rewritten.
-bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
-  if (auto *Load = dyn_cast<LoadSDNode>(N)) {
-    if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
-        Load->getExtensionType() != ISD::NON_EXTLOAD)
-      return false;
-    EVT VT = Load->getValueType(0);
-    unsigned Bits = VT.getFixedSizeInBits();
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    std::optional<unsigned> Offset =
-        getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
-    if (!Offset || (*Offset % 4 != 0))
-      return false;
-
-    SDLoc DL(N);
-    unsigned NumDwords = Bits / 32;
-    SmallVector<SDValue, 4> Dwords;
-    SmallVector<SDValue, 4> Chains;
-    for (unsigned I = 0; I != NumDwords; ++I) {
-      SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
-                       Load->getChain()};
-      MachineSDNode *Lane = CurDAG->getMachineNode(
-          AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
-      if (I == 0)
-        CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
-      Dwords.push_back(SDValue(Lane, 0));
-      Chains.push_back(SDValue(Lane, 1));
-    }
-
-    SDValue Val;
-    if (NumDwords == 1) {
-      Val = Dwords[0];
-      if (VT != MVT::i32)
-        Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
-    } else {
-      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
-      SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
-      Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
-    }
-    SDValue Chain = NumDwords == 1 ? Chains[0]
-                                   : CurDAG->getNode(ISD::TokenFactor, DL,
-                                                     MVT::Other, Chains);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
-    return true;
-  }
-
-  if (auto *Store = dyn_cast<StoreSDNode>(N)) {
-    if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
-        Store->isTruncatingStore())
-      return false;
-    SDValue Val = Store->getValue();
-    EVT VT = Val.getValueType();
-    unsigned Bits = VT.getFixedSizeInBits();
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    std::optional<unsigned> Offset =
-        getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
-    if (!Offset || (*Offset % 4 != 0))
-      return false;
-
-    SDLoc DL(N);
-    unsigned NumDwords = Bits / 32;
-    SmallVector<SDValue, 4> Dwords;
-    if (NumDwords == 1) {
-      if (VT != MVT::i32)
-        Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
-      Dwords.push_back(Val);
-    } else {
-      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
-      SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
-      for (unsigned I = 0; I != NumDwords; ++I)
-        Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
-                                         Vec,
-                                         CurDAG->getConstant(I, DL, MVT::i32)));
-    }
-
-    SmallVector<SDValue, 4> Chains;
-    for (unsigned I = 0; I != NumDwords; ++I) {
-      SDValue Ops[] = {Dwords[I],
-                       CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
-                       Store->getChain()};
-      MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
-                                                   DL, MVT::Other, Ops);
-      if (I == 0)
-        CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
-      Chains.push_back(SDValue(Lane, 0));
-    }
-    SDValue Chain = NumDwords == 1 ? Chains[0]
-                                   : CurDAG->getNode(ISD::TokenFactor, DL,
-                                                     MVT::Other, Chains);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
-    return true;
-  }
-
-  return false;
-}
-
 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  bool MadeChange = false;
+  if (!Subtarget->d16PreservesUnusedBits())
+    return;
 
-  // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
-  // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
-  SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
-  while (VGPRPos != CurDAG->allnodes_begin()) {
-    SDNode *N = &*--VGPRPos;
-    MadeChange |= rewriteVGPRFrameAccess(N);
-  }
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
-  if (Subtarget->d16PreservesUnusedBits()) {
-    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
-    while (Position != CurDAG->allnodes_begin()) {
-      SDNode *N = &*--Position;
-      if (N->use_empty())
-        continue;
-
-      switch (N->getOpcode()) {
-      case ISD::BUILD_VECTOR:
-        // TODO: Match load d16 from shl (extload:i16), 16
-        MadeChange |= matchLoadD16FromBuildVector(N);
-        break;
-      default:
-        break;
-      }
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty())
+      continue;
+
+    switch (N->getOpcode()) {
+    case ISD::BUILD_VECTOR:
+      // TODO: Match load d16 from shl (extload:i16), 16
+      MadeChange |= matchLoadD16FromBuildVector(N);
+      break;
+    default:
+      break;
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index cf62874912742..95f85a6151375 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,7 +67,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   bool matchLoadD16FromBuildVector(SDNode *N) const;
-  bool rewriteVGPRFrameAccess(SDNode *N);
   void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
new file mode 100644
index 0000000000000..25df6c9283bd3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
@@ -0,0 +1,315 @@
+//===- AMDGPULowerModuleVGPRs.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lays out a module's "VGPR as memory" (addrspace(13)) globals into one shared
+// register "file" and records where it lives on every function whose call graph
+// uses it.
+//
+// The file is backed by a fixed block of physical VGPRs, so for an address into
+// it to be meaningful across calls every function in the call graph must agree
+// on (a) each global's byte offset and (b) the file's base register. The
+// backend can derive a base per function (just above its ABI inputs) but those
+// differ, so (b) is resolved once per layout group (the functions and globals
+// connected through uses and through reachability from a using kernel):
+//  * Offsets: a group's globals are packed into one deterministic layout; each
+//    global's byte offset is recorded as "amdgpu.vgpr.memory.offset" metadata.
+//  * Base: one index per group, above the ABI-input VGPRs of every function in
+//    the group, so it clears every member's inputs yet stays as low as
+//    possible to preserve occupancy.
+//
+// The file size and base are attached as the "amdgpu-vgpr-memory-size" and
+// "amdgpu-vgpr-memory-base" attributes to every function whose call graph uses
+// the file (like LDS, it is live for a using kernel's whole execution, so all
+// reachable functions must reserve it). The backend consumes these:
+// SIISelLowering reads the offset metadata; SIMachineFunctionInfo reads the
+// attributes; SIRegisterInfo::getVGPRMemoryFile reserves [base, base + size).
+//
+// TODO: one module-wide layout makes every using function reserve all globals,
+// and a function reachable from several kernels reserve the file even for a
+// kernel that does not use it. A per-kernel layout (as AMDGPULowerModuleLDS
+// does, with a table for shared callees) would tighten this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-module-vgprs"
+
+namespace {
+
+constexpr char SizeAttr[] = "amdgpu-vgpr-memory-size";
+constexpr char BaseAttr[] = "amdgpu-vgpr-memory-base";
+constexpr char OffsetMD[] = "amdgpu.vgpr.memory.offset";
+
+// The fixed device-function ABI keeps the work-item ID in this register
+// (SITargetLowering::allocateSpecialInputVGPRsFixed). The shared file must not
+// overlap it.
+constexpr unsigned FixedWorkitemRegIdx = 31;
+
+// True if F may read the work-item ID (and so needs its work-item-ID input
+// register), per the amdgpu-no-workitem-id-* attributes.
+static bool usesWorkitemID(const Function &F) {
+  return !F.hasFnAttribute("amdgpu-no-workitem-id-x") ||
+         !F.hasFnAttribute("amdgpu-no-workitem-id-y") ||
+         !F.hasFnAttribute("amdgpu-no-workitem-id-z");
+}
+
+// Upper bound on the low contiguous VGPRs occupied by F's ABI inputs - the
+// registers the shared file must sit above. The fixed device-function ABI also
+// keeps the work-item ID in the high register v31 (see usesFixedWorkitemReg);
+// that is modelled separately, not counted here.
+static unsigned inputVGPRBound(const Function &F) {
+  // Compute kernels take args in the kernarg segment, not VGPRs; their only
+  // VGPR input is the work-item ID, packed into a single low register.
+  if (AMDGPU::isKernel(F.getCallingConv()))
+    return usesWorkitemID(F) ? 1 : 0;
+
+  // Graphics entry points and ordinary functions pass their arguments in VGPRs
+  // (except inreg arguments, which go in SGPRs).
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  unsigned N = 0;
+  for (const Argument &A : F.args()) {
+    if (A.hasAttribute(Attribute::InReg))
+      continue;
+    unsigned Dwords =
+        divideCeil(DL.getTypeAllocSize(A.getType()).getFixedValue(), 4u);
+    // A multi-dword argument tuple is even-aligned on targets that require
+    // aligned VGPR tuples. Model that gap conservatively so the shared base
+    // never lands below such an argument register (the backend's overlap check
+    // in getVGPRMemoryFile is the backstop if this is ever too low).
+    if (Dwords > 1)
+      N = alignTo(N, 2u);
+    N += Dwords;
+  }
+  return N;
+}
+
+// True if F is a callable (non-entry) device function on the default ABI, which
+// keeps the work-item ID in the fixed high register v31. The shared file must
+// not overlap v31 in such a function.
+static bool usesFixedWorkitemReg(const Function &F) {
+  CallingConv::ID CC = F.getCallingConv();
+  return !AMDGPU::isEntryFunctionCC(CC) && !AMDGPU::isGraphics(CC) &&
+         usesWorkitemID(F);
+}
+
+class AMDGPULowerModuleVGPRs : public ModulePass {
+public:
+  static char ID;
+  AMDGPULowerModuleVGPRs() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+
+  StringRef getPassName() const override { return "AMDGPU Lower Module VGPRs"; }
+};
+
+} // end anonymous namespace
+
+char AMDGPULowerModuleVGPRs::ID = 0;
+char &llvm::AMDGPULowerModuleVGPRsID = AMDGPULowerModuleVGPRs::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleVGPRs, DEBUG_TYPE, "AMDGPU Lower Module VGPRs",
+                false, false)
+
+ModulePass *llvm::createAMDGPULowerModuleVGPRsPass() {
+  return new AMDGPULowerModuleVGPRs();
+}
+
+static bool lowerModuleVGPRs(Module &M) {
+  SmallVector<GlobalVariable *, 8> Globals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::VGPR)
+      Globals.push_back(&GV);
+  if (Globals.empty())
+    return false;
+
+  // In one walk over each defined function, map it to the addrspace(13) globals
+  // it directly references and collect its ordinary calls (non-intrinsic,
+  // non-inline-asm) for the later reserved-register-clobber check, so the
+  // module is not traversed twice.
+  DenseMap<Function *, SmallVector<GlobalVariable *, 2>> Uses;
+  DenseMap<Function *, SmallVector<const CallBase *, 2>> Calls;
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    SmallPtrSet<GlobalVariable *, 4> Seen;
+    for (Instruction &I : instructions(F)) {
+      if (const auto *CB = dyn_cast<CallBase>(&I))
+        if (!CB->isInlineAsm() &&
+            CB->getIntrinsicID() == Intrinsic::not_intrinsic)
+          Calls[&F].push_back(CB);
+      for (Value *Op : I.operands()) {
+        // Only pointer operands can name a global; skipping the rest avoids a
+        // getUnderlyingObject call per non-pointer operand on every compile.
+        if (!Op->getType()->isPtrOrPtrVectorTy())
+          continue;
+        // getUnderlyingObject sees through constant-expression GEPs/casts, so
+        // a global referenced via e.g. getelementptr(@g, off) is found.
+        auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Op));
+        if (GV && GV->getAddressSpace() == AMDGPUAS::VGPR &&
+            Seen.insert(GV).second)
+          Uses[&F].push_back(GV);
+      }
+    }
+  }
+  if (Uses.empty())
+    return true; // nothing references the file
+
+  CallGraph CG(M);
+  auto Reachable = [&](Function *Root, SmallPtrSetImpl<Function *> &Out) {
+    SmallVector<Function *, 16> Work{Root};
+    while (!Work.empty()) {
+      Function *F = Work.pop_back_val();
+      if (!Out.insert(F).second)
+        continue;
+      if (CallGraphNode *N = CG[F])
+        for (auto &CR : *N)
+          if (Function *Callee = CR.second->getFunction())
+            if (!Callee->isDeclaration())
+              Work.push_back(Callee);
+    }
+  };
+
+  // Partition functions and globals into independent layout groups: a group
+  // covers everything reachable from a using kernel (the file is live for its
+  // whole execution, like LDS) plus every function that uses each global. So
+  // disjoint kernels get independent (low, occupancy-friendly) bases while
+  // shared functions stay in one group. Functions and globals are both
+  // GlobalValues, so one union-find covers both.
+  EquivalenceClasses<const GlobalValue *> Groups;
+  for (auto &[F, GVs] : Uses)
+    for (GlobalVariable *GV : GVs)
+      Groups.unionSets(F, GV);
+
+  // Functions reachable from each file-using kernel join that kernel's group
+  // (so they reserve the file), and kernels sharing any callee merge.
+  for (Function &K : M) {
+    if (K.isDeclaration() || !AMDGPU::isEntryFunctionCC(K.getCallingConv()))
+      continue;
+    SmallPtrSet<Function *, 16> R;
+    Reachable(&K, R);
+    if (llvm::none_of(R, [&](Function *F) { return Uses.count(F); }))
+      continue; // this kernel does not use the file
+    for (Function *F : R)
+      Groups.unionSets(&K, F);
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  LLVMContext &Ctx = M.getContext();
+  Type *I32 = Type::getInt32Ty(Ctx);
+
+  // Lay out each group independently.
+  for (auto It = Groups.begin(), E = Groups.end(); It != E; ++It) {
+    const auto *Leader = *It;
+    if (!Leader->isLeader())
+      continue;
+    SmallVector<GlobalVariable *, 8> GroupGlobals;
+    SmallVector<Function *, 16> GroupFns;
+    for (auto MI = Groups.member_begin(*Leader); MI != Groups.member_end();
+         ++MI) {
+      const GlobalValue *GV = *MI;
+      if (auto *G = dyn_cast<GlobalVariable>(GV))
+        GroupGlobals.push_back(const_cast<GlobalVariable *>(G));
+      else
+        GroupFns.push_back(const_cast<Function *>(cast<Function>(GV)));
+    }
+    if (GroupGlobals.empty() || GroupFns.empty())
+      continue;
+
+    // Deterministic packed layout (sorted by name).
+    llvm::stable_sort(GroupGlobals, [](GlobalVariable *A, GlobalVariable *B) {
+      return A->getName() < B->getName();
+    });
+    unsigned Size = 0;
+    for (GlobalVariable *GV : GroupGlobals) {
+      Align A = std::max(
+          DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()),
+          Align(4));
+      unsigned Offset = alignTo(Size, A);
+      GV->setMetadata(OffsetMD,
+                      MDNode::get(Ctx, {ConstantAsMetadata::get(
+                                           ConstantInt::get(I32, Offset))}));
+      Size = Offset + DL.getTypeAllocSize(GV->getValueType()).getFixedValue();
+    }
+
+    // One base for the group: above every member's low ABI-input VGPRs,
+    // even-aligned.
+    unsigned Base = 0;
+    bool ClearsFixedWorkitem = false;
+    for (Function *F : GroupFns) {
+      Base = std::max(Base, inputVGPRBound(*F));
+      ClearsFixedWorkitem |= usesFixedWorkitemReg(*F);
+    }
+    Base = alignTo(Base, 2u);
+
+    // The fixed device-function ABI keeps the work-item ID in v31. A small file
+    // sits below it; if the file would grow into v31, place it above instead
+    // (at an occupancy cost) so it never overlaps that input.
+    unsigned Dwords = AMDGPU::getVGPRMemoryFileDwords(Size);
+    if (ClearsFixedWorkitem && Base <= FixedWorkitemRegIdx &&
+        Base + Dwords > FixedWorkitemRegIdx)
+      Base = alignTo(FixedWorkitemRegIdx + 1, 2u);
+
+    // The file lives in low, caller-saved VGPRs that only group members
+    // reserve. A call to anything outside the group - indirect, external, or a
+    // defined non-member - does not reserve the file and clobbers it, so
+    // diagnose rather than silently corrupt it. (Direct calls between members
+    // are safe; intrinsics don't clobber.) Calls introduced after this pass
+    // (e.g. expanded libcalls) and inline asm clobbering a file register are
+    // caught later, in AMDGPUPrivateObjectVGPRs, where the machine-level calls
+    // and the final reserved registers are known.
+    SmallPtrSet<const Function *, 16> GroupFnSet(GroupFns.begin(),
+                                                 GroupFns.end());
+    for (Function *F : GroupFns)
+      for (const CallBase *CB : Calls.lookup(F)) {
+        const Function *Callee = CB->getCalledFunction();
+        if (!Callee || !GroupFnSet.contains(Callee))
+          Ctx.diagnose(DiagnosticInfoUnsupported(
+              *F,
+              "'VGPR as memory' is not supported in a function that makes an "
+              "indirect call or a call outside its call graph",
+              CB->getDebugLoc()));
+      }
+
+    for (Function *F : GroupFns) {
+      F->addFnAttr(SizeAttr, utostr(Size));
+      F->addFnAttr(BaseAttr, utostr(Base));
+    }
+  }
+  return true;
+}
+
+bool AMDGPULowerModuleVGPRs::runOnModule(Module &M) {
+  return lowerModuleVGPRs(M); // Shared with AMDGPULowerModuleVGPRsPass::run.
+}
+
+PreservedAnalyses AMDGPULowerModuleVGPRsPass::run(Module &M,
+                                                  ModuleAnalysisManager &) {
+  const bool Changed = lowerModuleVGPRs(M); // No analysis survives a change.
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 376a1ebcc4256..38f53b6365207 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -23,6 +23,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
 MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-lower-module-vgprs", AMDGPULowerModuleVGPRsPass())
 MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
               *static_cast<const GCNTargetMachine *>(this)))
@@ -67,7 +68,6 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
-FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
@@ -126,6 +126,7 @@ MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
 MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
 MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
 MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
+MACHINE_FUNCTION_PASS("amdgpu-private-object-vgprs", AMDGPUPrivateObjectVGPRsPass())
 MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
 MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
 MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index a3a1cf6f18bed..8e88cf60e3dbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -7,30 +7,32 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
-/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
-/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
-/// carries a constant byte offset, which selects the dword (subregister) to
-/// copy.
+/// Lowers the constant-index SI_VGPR_FRAME_{LOAD,STORE} pseudos for "VGPR as
+/// memory" objects (addrspace(13)) into register copies to/from the block of
+/// physical VGPRs backing the file: a load is a COPY from the file register, a
+/// store a COPY to it.
 ///
-/// This runs once the function is out of SSA form (so the single backing tuple
-/// can be defined by several subregister copies) and while LiveIntervals is
-/// available. The backing tuple has lane-divergent liveness (its subregisters
-/// are written and read independently), which the whole-register LiveVariables
-/// analysis cannot represent; the pass therefore updates the subregister-aware
-/// LiveIntervals directly.
+/// The file is a fixed block of VGPRs (SIRegisterInfo::getVGPRMemoryFile)
+/// reserved out of allocation (getReservedRegs) and counted in the VGPR usage
+/// (AMDGPUResourceUsageAnalysis). It sits just above the ABI inputs at a base
+/// AMDGPULowerModuleVGPRs shares across the call graph (so an address resolves
+/// to the same registers everywhere), low enough to cost only its own size
+/// rather than pinning occupancy. This pass runs after register allocation;
+/// until then the pseudos behave as opaque memory operations, so allocation is
+/// free to use any other register for the surrounding code.
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPUPrivateObjectVGPRs.h"
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
 
 using namespace llvm;
 
@@ -38,13 +40,66 @@ using namespace llvm;
 
 namespace {
 
-class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+// These two switches must list the same widths as the SI_VGPR_FRAME_{LOAD,
+// STORE}_B* `foreach` in SIInstructions.td.
+static bool isVGPRFrameLoad(unsigned Opc) {
+  switch (Opc) {
+  default: // Any other opcode is not a VGPR-frame load pseudo.
+    return false;
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B32:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B64:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B96:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B128:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B160:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B192:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B224:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B256:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B288:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B320:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B352:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B384:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B512:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B1024:
+    return true; // One of the per-width load pseudos.
+  }
+}
+
+static bool isVGPRFrameStore(unsigned Opc) {
+  switch (Opc) {
+  default: // Any other opcode is not a VGPR-frame store pseudo.
+    return false;
+  case AMDGPU::SI_VGPR_FRAME_STORE_B32:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B64:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B96:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B128:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B160:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B192:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B224:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B256:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B288:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B320:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B352:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B384:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B512:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B1024:
+    return true; // One of the per-width store pseudos.
+  }
+}
+
+class AMDGPUPrivateObjectVGPRs {
+public:
+  bool run(MachineFunction &MF); // True iff a frame pseudo was lowered.
+};
+
+class AMDGPUPrivateObjectVGPRsLegacy : public MachineFunctionPass {
 public:
   static char ID;
 
-  AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+  AMDGPUPrivateObjectVGPRsLegacy() : MachineFunctionPass(ID) {}
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    return AMDGPUPrivateObjectVGPRs().run(MF);
+  }
 
   StringRef getPassName() const override {
     return "AMDGPU Private Object VGPRs";
@@ -52,94 +107,146 @@ class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<LiveIntervalsWrapperPass>();
-    AU.addPreserved<LiveIntervalsWrapperPass>();
-    AU.addPreserved<SlotIndexesWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
 
 } // end anonymous namespace
 
-INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRsLegacy, DEBUG_TYPE,
                 "AMDGPU Private Object VGPRs", false, false)
 
-char AMDGPUPrivateObjectVGPRs::ID = 0;
+char AMDGPUPrivateObjectVGPRsLegacy::ID = 0;
 
-char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRsLegacy::ID;
 
-bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+AMDGPUPrivateObjectVGPRsPass::run(MachineFunction &MF,
+                                  MachineFunctionAnalysisManager &MFAM) {
+  if (AMDGPUPrivateObjectVGPRs().run(MF)) // Code changed; the CFG did not.
+    return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
+  return PreservedAnalyses::all();
+}
+
+bool AMDGPUPrivateObjectVGPRs::run(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  // Collect the pseudos and determine how many dwords the backing tuple needs.
-  SmallVector<MachineInstr *, 8> Worklist;
-  unsigned NumDwords = 0;
+  // Reserved physical VGPR block: no liveness or def needed, same registers
+  // across the call graph. Plain locals: C++17 lambdas can't capture bindings.
+  auto [FileBase, FileSize] = TRI->getVGPRMemoryFile(MF);
+  const unsigned BaseIdx = FileBase, FileDwords = FileSize;
+  if (FileDwords == 0)
+    return false;
+
+  const TargetRegisterClass &VGPR32 = AMDGPU::VGPR_32RegClass;
+
+  // The file lives in low, caller-saved VGPRs. AMDGPULowerModuleVGPRs diagnoses
+  // calls that escape the group at the IR level, but later passes (e.g.
+  // AtomicExpand, CodeGenPrepare) can introduce libcalls, and inline asm naming
+  // a file register is not seen there at all. Both would clobber the file, so
+  // catch them here, now that the reserved registers and machine calls are
+  // final.
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  auto FileOverlaps = [&](Register Reg) {
+    for (unsigned I = 0; I != FileDwords; ++I)
+      if (TRI->regsOverlap(Reg, VGPR32.getRegister(BaseIdx + I)))
+        return true;
+    return false;
+  };
+  auto RegMaskClobbersFile = [&](const MachineOperand &MO) {
+    for (unsigned I = 0; I != FileDwords; ++I)
+      if (MO.clobbersPhysReg(VGPR32.getRegister(BaseIdx + I)))
+        return true;
+    return false;
+  };
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
-      unsigned Opc = MI.getOpcode();
-      if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
-          Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+      if (MI.isInlineAsm()) {
+        // A clobber surfaces either as an explicit physical-register def or,
+        // for some forms, as a register-mask operand; check both.
+        for (const MachineOperand &MO : MI.operands())
+          if ((MO.isReg() && MO.getReg().isPhysical() && MO.isDef() &&
+               FileOverlaps(MO.getReg())) ||
+              (MO.isRegMask() && RegMaskClobbersFile(MO))) {
+            Ctx.diagnose(DiagnosticInfoUnsupported(
+                MF.getFunction(),
+                "inline asm clobbers a 'VGPR as memory' reserved register",
+                MI.getDebugLoc()));
+            break;
+          }
         continue;
-      unsigned ByteOffset = MI.getOperand(1).getImm();
-      NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
-      Worklist.push_back(&MI);
+      }
+      // A call clobbers caller-saved VGPRs, including the file, unless the
+      // callee reserves the same file: an in-group member (which carries the
+      // size attribute) or this function itself (self-recursion). Anything else
+      // - an out-of-group/external callee, or an indirect call with no
+      // resolvable callee - does not preserve it. AMDGPULowerModuleVGPRs
+      // catches IR-level escapes; this also covers calls introduced after it
+      // (e.g. expanded libcalls) and indirect machine calls it could not see.
+      if (MI.isCall()) {
+        const MachineOperand *CalleeOp =
+            TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+        const auto *Callee =
+            CalleeOp && CalleeOp->isGlobal()
+                ? dyn_cast<Function>(
+                      CalleeOp->getGlobal()->stripPointerCastsAndAliases())
+                : nullptr;
+        if (Callee == &MF.getFunction() ||
+            (Callee && Callee->hasFnAttribute("amdgpu-vgpr-memory-size")))
+          continue;
+        Ctx.diagnose(DiagnosticInfoUnsupported(
+            MF.getFunction(),
+            "call to a function that clobbers the 'VGPR as memory' reserved "
+            "file",
+            MI.getDebugLoc()));
+      }
     }
   }
 
-  if (Worklist.empty())
-    return false;
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+      unsigned Opc = MI.getOpcode();
+      bool IsLoad = isVGPRFrameLoad(Opc);
+      if (!IsLoad && !isVGPRFrameStore(Opc))
+        continue;
 
-  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-
-  const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
-  assert(RC && "no VGPR register class for VGPR-as-memory object");
-  Register Storage = MRI.createVirtualRegister(RC);
-
-  // Define the whole tuple up front so partial (subregister) writes and reads
-  // of uninitialized lanes are well formed.
-  MachineBasicBlock &Entry = MF.front();
-  MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
-                                 TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
-  LIS->InsertMachineInstrInMaps(*ImpDef);
-
-  for (MachineInstr *MI : Worklist) {
-    MachineBasicBlock &MBB = *MI->getParent();
-    const DebugLoc &DL = MI->getDebugLoc();
-    unsigned Dword = MI->getOperand(1).getImm() / 4;
-    unsigned SubReg = NumDwords == 1
-                          ? AMDGPU::NoSubRegister
-                          : SIRegisterInfo::getSubRegFromChannel(Dword);
-
-    MachineInstr *Copy;
-    if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
-      Register Dst = MI->getOperand(0).getReg();
-      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
-                 .addReg(Storage, {}, SubReg);
-    } else {
-      Register Src = MI->getOperand(0).getReg();
-      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
-                 .addReg(Storage, RegState::Define, SubReg)
-                 .addReg(Src);
+      const DebugLoc &DL = MI.getDebugLoc();
+      unsigned Dword = MI.getOperand(1).getImm();
+      Register Data = MI.getOperand(0).getReg();
+      unsigned AccessDwords = TRI->getRegSizeInBits(Data, MRI) / 32;
+
+      // Bounds-checked at pseudo creation (LowerLoadStoreVGPR); never name a
+      // register outside the reserved file.
+      assert(Dword + AccessDwords <= FileDwords &&
+             "VGPR-as-memory access outside the reserved file");
+
+      // Copy the access dword-by-dword between the data (sub)registers and the
+      // file registers. Doing it per dword rather than as one tuple COPY avoids
+      // needing an aligned physical VGPR tuple for the file slice, which can
+      // start on an odd register on targets that require aligned tuples.
+      for (unsigned I = 0; I != AccessDwords; ++I) {
+        MCRegister FileReg = VGPR32.getRegister(BaseIdx + Dword + I);
+        Register DataReg =
+            AccessDwords == 1
+                ? Data
+                : Register(TRI->getSubReg(
+                      Data, SIRegisterInfo::getSubRegFromChannel(I)));
+        if (IsLoad)
+          BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), DataReg)
+              .addReg(FileReg);
+        else
+          BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), FileReg)
+              .addReg(DataReg);
+      }
+
+      MI.eraseFromParent();
+      Changed = true;
     }
-    // The copy takes the pseudo's slot, so the intervals of the copied
-    // load/store operand stay valid.
-    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
-    MI->eraseFromParent();
   }
 
-  // The backing tuple is brand new; compute its (subregister) live interval.
-  LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
-
-  // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
-  // form disconnected value-number components within the single tuple, which an
-  // individual live interval must not contain. Split them into separate
-  // virtual registers, exactly as the register coalescer does for the intervals
-  // it leaves behind.
-  SmallVector<LiveInterval *, 4> SplitLIs;
-  LIS->splitSeparateComponents(LI, SplitLIs);
-
-  return true;
+  return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
new file mode 100644
index 0000000000000..f18b9a608970d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
@@ -0,0 +1,30 @@
+//===- AMDGPUPrivateObjectVGPRs.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// New pass manager interface for the pass that lowers the "VGPR as memory"
+/// SI_VGPR_FRAME_{LOAD,STORE} pseudos into copies to/from the file registers.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+/// Lowers SI_VGPR_FRAME_{LOAD,STORE} pseudos to register copies (new PM).
+class AMDGPUPrivateObjectVGPRsPass
+    : public PassInfoMixin<AMDGPUPrivateObjectVGPRsPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index c587302c3bbae..2223b9d036fa1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -139,7 +138,6 @@ class AMDGPUPromoteAllocaImpl {
   unsigned MaxVGPRs;
   unsigned VGPRBudgetRatio;
   unsigned MaxVectorRegs;
-  unsigned AllocVGPROffset = 0;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -164,10 +162,6 @@ class AMDGPUPromoteAllocaImpl {
   void analyzePromoteToVector(AllocaAnalysis &AA) const;
   void promoteAllocaToVector(AllocaAnalysis &AA);
   void analyzePromoteToLDS(AllocaAnalysis &AA) const;
-
-  /// Allocate an alloca that already lives in the VGPR address space to a range
-  /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
-  void allocateVgprs(AllocaAnalysis &AA);
   bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                              SetVector<IntrinsicInst *> &DeferredIntrs);
   void
@@ -185,11 +179,7 @@ class AMDGPUPromoteAllocaImpl {
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
   }
 
-  /// IsLatePass is true when invoked as a codegen pass and false when invoked
-  /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
-  /// requests only the work strictly required for functionality (i.e. VGPR
-  /// allocation), skipping the optimization-oriented promotions.
-  bool run(Function &F, bool IsLatePass, bool NoOpt);
+  bool run(Function &F, bool PromoteToLDS);
 };
 
 // FIXME: This can create globals so should be a module pass.
@@ -197,34 +187,26 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 public:
   static char ID;
 
-  explicit AMDGPUPromoteAlloca(
-      CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
-      : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
+  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
-    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
       return AMDGPUPromoteAllocaImpl(
                  TPC->getTM<TargetMachine>(), *F.getParent(),
                  getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
-          .run(F, /*IsLatePass=*/true, NoOpt);
-    }
+          .run(F, /*PromoteToLDS*/ true);
     return false;
   }
 
-  StringRef getPassName() const override {
-    return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
-  }
+  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
-
-private:
-  bool NoOpt;
 };
 
 static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -269,7 +251,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
   bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
-                     .run(F, /*IsLatePass=*/true, /*NoOpt=*/false);
+                     .run(F, /*PromoteToLDS=*/true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -282,20 +264,7 @@ PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
   bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
-                     .run(F, /*IsLatePass=*/false, /*NoOpt=*/false);
-  if (Changed) {
-    PreservedAnalyses PA;
-    PA.preserveSet<CFGAnalyses>();
-    return PA;
-  }
-  return PreservedAnalyses::all();
-}
-
-PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
-                                              FunctionAnalysisManager &AM) {
-  auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
-                     .run(F, /*IsLatePass=*/true, /*NoOpt=*/true);
+                     .run(F, /*PromoteToLDS=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -304,8 +273,8 @@ PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
   return PreservedAnalyses::all();
 }
 
-FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
-  return new AMDGPUPromoteAlloca(OptLevel);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+  return new AMDGPUPromoteAlloca();
 }
 
 bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -398,121 +367,14 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
     VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
 }
 
-// A "VGPR as memory" object can only be realized in registers today when every
-// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
-// bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses need gfx13 support, which is not yet
-// available; such objects fall back to scratch instead.
-//
-// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
-// support so this fallback is no longer needed.
-static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
-                                            const DataLayout &DL) {
-  // An access is lowerable if it covers a whole number of dwords and starts at
-  // a dword-aligned constant offset from the alloca.
-  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
-    if (!Simple)
-      return false;
-    uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
-    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
-        DL, Off, /*AllowNonInbounds=*/true);
-    return Base == &AI && Off.urem(4) == 0;
-  };
-
-  SmallVector<const Use *, 16> Worklist;
-  for (const Use &U : AI.uses())
-    Worklist.push_back(&U);
-
-  while (!Worklist.empty()) {
-    const Use *U = Worklist.pop_back_val();
-    const User *Usr = U->getUser();
-
-    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
-      if (!GEP->hasAllConstantIndices())
-        return false;
-      for (const Use &GU : GEP->uses())
-        Worklist.push_back(&GU);
-      continue;
-    }
-    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
-      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
-        return false;
-      continue;
-    }
-    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
-      // The pointer must be the address operand, not a stored value (escape).
-      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
-        return false;
-      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
-                    SI->isSimple()))
-        return false;
-      continue;
-    }
-    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
-    // otherwise not statically lowerable.
-    return false;
-  }
-  return true;
-}
-
-// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
-// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
-// falls back to ordinary scratch.
-static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
-  SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
-  for (Use *U : Uses) {
-    User *Usr = U->getUser();
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
-      IRBuilder<> B(GEP);
-      SmallVector<Value *, 4> Indices(GEP->indices());
-      Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
-                                  GEP->getName(), GEP->getNoWrapFlags());
-      rewriteVGPRPointerToScratch(GEP, NewGEP);
-      GEP->eraseFromParent();
-      continue;
-    }
-    if (auto *II = dyn_cast<IntrinsicInst>(Usr);
-        II && II->isLifetimeStartOrEnd()) {
-      II->eraseFromParent();
-      continue;
-    }
-    // Loads, stores, address-space casts and call arguments only need this
-    // operand repointed; their result types do not depend on the operand's
-    // address space.
-    U->set(New);
-  }
-}
-
-static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
-  auto *NewAI = new AllocaInst(
-      AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
-      AI->getAlign(), AI->getName(), AI->getIterator());
-  NewAI->setDebugLoc(AI->getDebugLoc());
-  rewriteVGPRPointerToScratch(AI, NewAI);
-  AI->eraseFromParent();
-}
-
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
-  assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
-  if (!IsLatePass && DisablePromoteAllocaToVector)
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
+  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
     return false;
 
-  bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
-  bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
-
   bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
   MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM, F) : 128;
   setFunctionLimits(F);
 
-  // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
-  // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  const bool TargetSupportsVGPRAsMemory =
-      ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
-
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
@@ -529,18 +391,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
       LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
 
       AllocaAnalysis AA{AI};
-      if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
-        // Allocas that already live in the VGPR address space only need to be
-        // assigned VGPRs, which is required for functionality.
-        if (IsLatePass)
-          Allocas.push_back(std::move(AA));
-        continue;
-      }
-      if (!PromoteToVector && !PromoteToLDS)
-        continue;
       if (collectAllocaUses(AA)) {
-        if (PromoteToVector)
-          analyzePromoteToVector(AA);
+        analyzePromoteToVector(AA);
         if (PromoteToLDS)
           analyzePromoteToLDS(AA);
         if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -551,15 +403,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
     }
   }
 
-  stable_sort(Allocas, [](const auto &A, const auto &B) {
-    // Prioritize pre-existing VGPR allocas, since their allocation must not
-    // fail.
-    bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
-    bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
-    if (AIsVGPR != BIsVGPR)
-      return AIsVGPR;
-    return A.Score > B.Score;
-  });
+  stable_sort(Allocas,
+              [](const auto &A, const auto &B) { return A.Score > B.Score; });
 
   // clang-format off
   LLVM_DEBUG(
@@ -572,39 +417,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   bool Changed = false;
   SetVector<IntrinsicInst *> DeferredIntrs;
   for (AllocaAnalysis &AA : Allocas) {
-    if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
-      // Fall back to scratch (and warn) when the object can't be kept in
-      // registers, so the program still compiles correctly: either the target
-      // does not support "VGPR as memory", or the access pattern (dynamic
-      // index, sub-dword, escaping address) is not yet supported.
-      const char *Unsupported = nullptr;
-      if (!TargetSupportsVGPRAsMemory)
-        Unsupported = "not supported on this target";
-      else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
-        Unsupported = "dynamic indexing, sub-dword access, or escaping address "
-                      "is not yet supported";
-      if (Unsupported) {
-        F.getContext().diagnose(DiagnosticInfoUnsupported(
-            F,
-            Twine("'amdgpu_vgpr' object could not be kept in vector registers "
-                  "(") +
-                Unsupported + "); using scratch memory instead",
-            AA.Alloca->getDebugLoc(), DS_Warning));
-        demoteVGPRAllocaToScratch(AA.Alloca);
-        Changed = true;
-        continue;
-      }
-      const unsigned AllocaCost =
-          AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
-      allocateVgprs(AA);
-      // Account for the consumed VGPRs in the vectorization budget.
-      if (VectorizationBudget > AllocaCost)
-        VectorizationBudget -= AllocaCost;
-      else
-        VectorizationBudget = 0;
-      Changed = true;
-      continue;
-    }
     if (AA.Vector.Ty) {
       std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
       assert(Size); // Expected to succeed on non-array alloca.
@@ -639,21 +451,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   return Changed;
 }
 
-void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
-  LLVMContext &Ctx = Mod->getContext();
-  const unsigned AllocaSize =
-      DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()) / 8;
-
-  // Record where the object was allocated within the VGPR file.
-  Type *I32 = Type::getInt32Ty(Ctx);
-  AA.Alloca->setMetadata(
-      "amdgpu.allocated.vgprs",
-      MDNode::get(
-          Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
-                ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
-  AllocVGPROffset += alignTo(AllocaSize, 4);
-}
-
 // Checks if the instruction I is a memset user of the alloca AI that we can
 // deal with. Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index aab43f23cf606..f41474a2bc031 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -176,12 +176,24 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
     Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
                                           /*IncludeCalls=*/false);
 
+  // Reserved "VGPR as memory" file registers aren't "used" but must still be
+  // allocated, so the VGPR count has to cover the highest one.
+  std::pair<unsigned, unsigned> VGPRMemFile = TRI.getVGPRMemoryFile(MF);
+  unsigned VGPRMemBase = VGPRMemFile.first;
+  unsigned VGPRMemCount = VGPRMemFile.second;
+  auto AccountForVGPRMemoryFile = [&](int32_t NumVGPR) -> int32_t {
+    if (VGPRMemCount)
+      NumVGPR = std::max<int32_t>(NumVGPR, VGPRMemBase + VGPRMemCount);
+    return NumVGPR;
+  };
+
   // If there are no calls, MachineRegisterInfo can tell us the used register
   // count easily.
   // A tail call isn't considered a call for MachineFrameInfo's purposes.
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
     Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
                                           /*IncludeCalls=*/false);
+    Info.NumVGPR = AccountForVGPRMemoryFile(Info.NumVGPR);
     return Info;
   }
 
@@ -319,7 +331,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
     }
   }
 
-  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumVGPR = AccountForVGPRMemoryFile(MaxVGPR + 1);
 
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7fc233be91fe0..d6d0c36721fdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -31,6 +31,7 @@
 #include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUPreloadKernArgProlog.h"
 #include "AMDGPUPrepareAGPRAlloc.h"
+#include "AMDGPUPrivateObjectVGPRs.h"
 #include "AMDGPURemoveIncompatibleFunctions.h"
 #include "AMDGPUReserveWWMRegs.h"
 #include "AMDGPUResourceUsageAnalysis.h"
@@ -668,7 +669,8 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerSGPRSpillsLegacyPass(*PR);
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
-  initializeAMDGPUPrivateObjectVGPRsPass(*PR);
+  initializeAMDGPUPrivateObjectVGPRsLegacyPass(*PR);
+  initializeAMDGPULowerModuleVGPRsPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1492,6 +1494,11 @@ void AMDGPUPassConfig::addIRPasses() {
     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
   }
 
+  // Lay out "VGPR as memory" (addrspace(13)) globals into one shared register
+  // file and record its size/base on the participating functions, so it
+  // resolves to the same registers across a kernel's call graph.
+  addPass(createAMDGPULowerModuleVGPRsPass());
+
   // Run atomic optimizer before Atomic Expand
   if ((TM.getTargetTriple().isAMDGCN()) &&
       (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
@@ -1501,12 +1508,9 @@ void AMDGPUPassConfig::addIRPasses() {
 
   addPass(createAtomicExpandLegacyPass());
 
-  // With optimizations enabled, do the full promotion of allocas. Without
-  // optimizations, this only allocates pre-existing VGPR address space allocas,
-  // which is required for functionality.
-  addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
-
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    addPass(createAMDGPUPromoteAlloca());
+
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -1721,11 +1725,6 @@ void GCNPassConfig::addFastRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
-  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
-  // At O0 there is no register coalescer; anchor on TwoAddress, where
-  // LiveIntervals is already available.
-  insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
-
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
 
   TargetPassConfig::addFastRegAlloc();
@@ -1752,12 +1751,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
-  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
-  // This runs after the coalescer so it does not perturb the kill flags that
-  // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
-  // the LiveIntervals the register allocator consumes next.
-  insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
-
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
@@ -1909,6 +1902,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
 }
 
 void GCNPassConfig::addPostRegAlloc() {
+  // Lower "VGPR as memory" accesses into copies to/from the reserved VGPR file.
+  // Runs after register allocation (so the file's reserved registers are final)
+  // and before memory-aware post-RA passes (so the pseudos are no longer seen
+  // as memory operations).
+  addPass(&AMDGPUPrivateObjectVGPRsID);
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOptLevel::None)
     addPass(&SIOptimizeExecMaskingLegacyID);
@@ -2290,6 +2288,10 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
   if (EnableLowerModuleLDS)
     addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
 
+  // Lay out "VGPR as memory" (addrspace(13)) globals into a shared register
+  // file (see the legacy pipeline above for details).
+  addModulePass(AMDGPULowerModuleVGPRsPass(), PMW);
+
   // Run atomic optimizer before Atomic Expand
   if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
@@ -2298,15 +2300,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
 
   addFunctionPass(AtomicExpandPass(TM), PMW);
 
-  // With optimizations enabled, do the full promotion of allocas. Without
-  // optimizations, only allocate pre-existing VGPR address space allocas, which
-  // is required for functionality.
-  if (TM.getOptLevel() > CodeGenOptLevel::None)
-    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
-  else
-    addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
-
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(PMW);
 
@@ -2619,6 +2614,9 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
 }
 
 void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
+  // Lower "VGPR as memory" accesses into copies to/from the reserved VGPR file
+  // (see the legacy GCNPassConfig::addPostRegAlloc for ordering rationale).
+  addMachineFunctionPass(AMDGPUPrivateObjectVGPRsPass(), PMW);
   addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
   if (TM.getOptLevel() > CodeGenOptLevel::None)
     addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd25ab71997d7..3ca9f5bcc9f9d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
+  AMDGPULowerModuleVGPRs.cpp
   AMDGPUPrepareAGPRAlloc.cpp
   AMDGPULowerExecSync.cpp
   AMDGPUSwLowerLDS.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index edb31d77fa510..5c768bcb54d6e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4228,6 +4228,22 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return true;
 }
 
+bool SITargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+  // GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses, so
+  // fall back to SelectionDAG (which does) for any instruction that produces or
+  // consumes such a pointer. TODO: implement the GlobalISel path.
+  auto IsVGPRPtr = [](const Value *V) {
+    // Match a vector of pointers too (e.g. a vector GEP or a gather/scatter);
+    // getPointerAddressSpace() reports the element pointer's address space.
+    Type *Ty = V->getType();
+    return Ty->isPtrOrPtrVectorTy() &&
+           Ty->getPointerAddressSpace() == AMDGPUAS::VGPR;
+  };
+  if (IsVGPRPtr(&Inst))
+    return true;
+  return any_of(Inst.operands(), IsVGPRPtr);
+}
+
 namespace {
 // Chain calls have special arguments that we need to handle. These are
 // tagging along at the end of the arguments list(s), after the SGPR and VGPR
@@ -5229,11 +5245,16 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
       MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   Register CondReg = MRI.createVirtualRegister(BoolRC);
 
-  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
-      .addReg(InitReg)
-      .addMBB(&OrigBB)
-      .addReg(ResultReg)
-      .addMBB(&LoopBB);
+  // A zero PhiReg means the caller threads no per-iteration result value
+  // through the loop (e.g. a store whose destination is a fixed physical
+  // register), so the result PHI - and its requirement that ResultReg be
+  // live-out of the loop - is omitted.
+  if (PhiReg)
+    BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+        .addReg(InitReg)
+        .addMBB(&OrigBB)
+        .addReg(ResultReg)
+        .addMBB(&LoopBB);
 
   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
       .addReg(InitSaveExecReg)
@@ -5595,6 +5616,118 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+// Expand a runtime-index "VGPR as memory" access into an indirect movrel /
+// s_set_gpr_idx read/write of the reserved file (a waterfall loop if the index
+// is divergent), reusing the indirect-vector-element machinery.
+static MachineBasicBlock *emitVGPRFrameDynamic(MachineInstr &MI,
+                                               MachineBasicBlock &MBB,
+                                               const GCNSubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const bool IsLoad = MI.getOpcode() == AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32;
+
+  auto [BaseIdx, Count] = TRI.getVGPRMemoryFile(*MF);
+  assert(Count && "dynamic VGPR-memory access without a reserved file");
+  const TargetRegisterClass *VecRC = TRI.getVGPRClassForBitWidth(Count * 32);
+  assert(VecRC && "dynamic VGPR-memory file has no tuple class; "
+                  "LowerLoadStoreVGPR rejects this before creating the pseudo");
+  unsigned VecBits = TRI.getRegSizeInBits(*VecRC);
+  // movrel reads name the base sub-register itself (a physical-register
+  // operand takes no subregister index) and use the file tuple implicitly.
+  MCRegister FileBaseReg = AMDGPU::VGPR_32RegClass.getRegister(BaseIdx);
+  MCRegister FileReg =
+      TRI.getMatchingSuperReg(FileBaseReg, AMDGPU::sub0, VecRC);
+  assert(FileReg && "VGPR-memory file base is misaligned for its tuple class");
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
+
+  // Index is file-relative (the constant part was folded in at ISel): base
+  // sub0, no extra offset.
+  unsigned SubReg = AMDGPU::sub0;
+  int Offset = 0;
+
+  // Emit the indexed read/write at InsPt: GPR-idx mode uses IdxReg, movrel mode
+  // uses the preset m0 (IdxReg then unused).
+  auto EmitAccess = [&](MachineBasicBlock &BB,
+                        MachineBasicBlock::iterator InsPt, Register IdxReg) {
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      if (UseGPRIdxMode)
+        BuildMI(BB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, true), Dst)
+            .addReg(FileReg)
+            .addReg(IdxReg)
+            .addImm(SubReg);
+      else
+        BuildMI(BB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+            .addReg(FileBaseReg)
+            .addReg(FileReg, RegState::Implicit);
+    } else {
+      const MachineOperand *Val =
+          TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+      if (UseGPRIdxMode)
+        BuildMI(BB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, false),
+                FileReg)
+            .addReg(FileReg)
+            .add(*Val)
+            .addReg(IdxReg)
+            .addImm(SubReg);
+      else
+        BuildMI(BB, InsPt, DL,
+                TII->getIndirectRegWriteMovRelPseudo(VecBits, 32, false),
+                FileReg)
+            .addReg(FileReg)
+            .add(*Val)
+            .addImm(SubReg);
+    }
+  };
+
+  MachineBasicBlock::iterator I(&MI);
+
+  // Uniform (scalar) index: set up the index in place and emit the access.
+  if (TRI.isSGPRClass(IdxRC)) {
+    Register IdxReg;
+    if (UseGPRIdxMode)
+      IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+    else
+      setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+    EmitAccess(MBB, I, IdxReg);
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  // Divergent (per-lane) index: a waterfall loop covers the lanes sharing each
+  // index. The file is in fixed (reserved) physical registers, so unlike
+  // indirect vector access it is not threaded through a PHI - the per-lane
+  // access reads/writes it in place under EXEC - and a stored value must stay
+  // live across the back-edge.
+  if (!IsLoad)
+    MRI.clearKillFlags(
+        TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg());
+
+  // A load threads its result through the loop; a store threads nothing
+  // (PhiReg == 0 skips the result PHI).
+  Register PhiReg, InitReg;
+  if (IsLoad) {
+    PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+  }
+
+  Register SGPRIdxReg;
+  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+                              UseGPRIdxMode, SGPRIdxReg);
+  MachineBasicBlock *LoopBB = InsPt->getParent();
+  EmitAccess(*LoopBB, InsPt, SGPRIdxReg);
+
+  MI.eraseFromParent();
+  return LoopBB;
+}
+
 static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
                                                       MachineBasicBlock *BB) {
   // For targets older than GFX12, we emit a sequence of 32-bit operations.
@@ -7122,6 +7255,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case AMDGPU::SI_INDIRECT_DST_V16:
   case AMDGPU::SI_INDIRECT_DST_V32:
     return emitIndirectDst(MI, *BB, *getSubtarget());
+  case AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32:
+  case AMDGPU::SI_VGPR_FRAME_DYN_STORE_B32:
+    return emitVGPRFrameDynamic(MI, *BB, *getSubtarget());
   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
   case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
@@ -9925,6 +10061,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
 }
 
+// File byte offset recorded for an addrspace(13) global, or nullopt if none.
+static std::optional<uint64_t> getVGPRMemoryOffset(const GlobalVariable *GV) {
+  MDNode *Layout = GV->getMetadata("amdgpu.vgpr.memory.offset");
+  if (!Layout)
+    return std::nullopt; // never laid out by AMDGPULowerModuleVGPRs
+  return mdconst::extract<ConstantInt>(Layout->getOperand(0))->getZExtValue();
+}
+
 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
                                              SDValue Op,
                                              SelectionDAG &DAG) const {
@@ -9933,6 +10077,30 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
   EVT PtrVT = Op.getValueType();
 
   const GlobalValue *GV = GSD->getGlobal();
+
+  // A "VGPR as memory" (addrspace(13)) global has no numeric address; its
+  // "address" is the object's byte offset in the file. Lower it to that
+  // constant so even a standalone materialization (e.g. a constexpr GEP) never
+  // reaches the pc-relative global-address sequence.
+  if (GSD->getAddressSpace() == AMDGPUAS::VGPR) {
+    // The object (resolving aliases) must be a global variable laid out by
+    // AMDGPULowerModuleVGPRs. Diagnose a missing layout here too: this fold runs
+    // before LowerLoadStoreVGPR's own check, so a folded constant base would
+    // otherwise bypass it and silently resolve to offset 0.
+    const auto *GVar = dyn_cast<GlobalVariable>(GV->getAliaseeObject());
+    std::optional<uint64_t> MDOffset =
+        GVar ? getVGPRMemoryOffset(GVar) : std::nullopt;
+    if (!MDOffset) {
+      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+          DAG.getMachineFunction().getFunction(),
+          "unsupported 'VGPR as memory' access: missing "
+          "amdgpu.vgpr.memory.offset layout metadata",
+          DL.getDebugLoc()));
+      return DAG.getPOISON(PtrVT);
+    }
+    return DAG.getConstant(GSD->getOffset() + *MDOffset, DL, PtrVT);
+  }
+
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
@@ -14332,6 +14500,248 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
 }
 
+/// Lower a load/store of a "VGPR as memory" object (a global in AMDGPUAS::VGPR)
+/// into an AMDGPUISD::REG_{LOAD,STORE} node carrying the dword index of the
+/// access within the reserved VGPR file. A constant index selects the
+/// SI_VGPR_FRAME_* pseudos (rewritten to register copies by
+/// AMDGPUPrivateObjectVGPRs); a runtime index selects the SI_VGPR_FRAME_DYN_*
+/// pseudos (expanded to an indexed register move). Sub-dword (8/16-bit)
+/// accesses, integer or not, are a read-modify-write of the containing dword.
+///
+/// An access this routine cannot handle (e.g. a wider-than-dword dynamic
+/// access, or a base with no layout metadata) is diagnosed and replaced with
+/// poison (load) / its incoming chain (store), so it never reaches instruction
+/// selection as an unselectable memory operation.
+SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MemSDNode *MemOp = cast<MemSDNode>(Op);
+  SDLoc DL(Op);
+
+  // Diagnose as described above: poison for a load, the chain for a store.
+  auto Unsupported = [&](const Twine &Reason) -> SDValue {
+    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+        DAG.getMachineFunction().getFunction(),
+        "unsupported 'VGPR as memory' access: " + Reason, DL.getDebugLoc()));
+    if (isa<StoreSDNode>(MemOp))
+      return MemOp->getChain();
+    return DAG.getMergeValues(
+        {DAG.getPOISON(MemOp->getValueType(0)), MemOp->getChain()}, DL);
+  };
+
+  // The pointer is a byte offset into the file. After stripping a folded GEP
+  // offset, the base is the addrspace(13) global (offset in metadata), the
+  // constant LowerGlobalAddress folds it to, or a runtime value (dynamic).
+  SDValue Ptr = MemOp->getBasePtr();
+  // Accumulate in 64 bits: the pointer is only 32-bit, so a folded constant
+  // such as a negative GEP index would otherwise wrap past the range check.
+  uint64_t ExtraOffset = 0;
+  SDValue DynByteOffset; // non-constant byte offset, for a runtime index
+  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1)))
+      ExtraOffset = C->getZExtValue();
+    else
+      DynByteOffset = Ptr.getOperand(1);
+    Ptr = Ptr.getOperand(0);
+  }
+
+  uint64_t ByteOffset = ExtraOffset;
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
+    if (GA->getAddressSpace() != AMDGPUAS::VGPR)
+      return Unsupported(
+          "base is a global outside the VGPR address space (13)");
+    const auto *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+    if (!GV)
+      return Unsupported(
+          "base is not a VGPR address space (13) global variable");
+    std::optional<uint64_t> MDOffset = getVGPRMemoryOffset(GV);
+    if (!MDOffset)
+      return Unsupported("missing amdgpu.vgpr.memory.offset layout metadata");
+    ByteOffset += *MDOffset + GA->getOffset();
+  } else if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
+    ByteOffset += C->getZExtValue();
+  } else {
+    if (DynByteOffset)
+      return Unsupported("two independent dynamic address terms");
+    DynByteOffset = Ptr; // the base is itself a runtime byte offset
+  }
+  EVT MemVT = MemOp->getMemoryVT();
+  unsigned BitWidth = MemVT.getSizeInBits();
+  MachineFunction &MFn = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *MFI = MFn.getInfo<SIMachineFunctionInfo>();
+  unsigned FileBytes = MFI->getVGPRMemorySize();
+  SDValue Chain = MemOp->getChain();
+
+  auto GetDwordMMO = [&](MachineMemOperand::Flags F) {
+    return MFn.getMachineMemOperand(MemOp->getPointerInfo(), F, /*Size=*/4,
+                                    Align(4));
+  };
+
+  // Lower a sub-dword (8/16-bit) access at dword Index, with the field starting
+  // at bit BitInDword, as a read-modify-write (store) or extract (load) of the
+  // containing dword. Index and BitInDword may be constants - which fold, so
+  // this serves both the constant- and runtime-index paths.
+  auto EmitSubDword = [&](SDValue Index, SDValue BitInDword) -> SDValue {
+    // The splice is integer arithmetic, so a non-integer value (half, bfloat,
+    // <2 x i8>) is bitcast via the field-sized integer and must be that wide.
+    auto *StoreOp = dyn_cast<StoreSDNode>(MemOp);
+    EVT DataVT = StoreOp ? StoreOp->getValue().getValueType()
+                         : MemOp->getValueType(0);
+    const bool IsInt = DataVT.isScalarInteger();
+    if (!IsInt && DataVT.getFixedSizeInBits() != BitWidth)
+      return Unsupported("extending/truncating non-integer sub-dword access");
+    EVT IntVT = IsInt ? DataVT : EVT::getIntegerVT(*DAG.getContext(), BitWidth);
+    SDValue LowMaskC =
+        DAG.getConstant(maskTrailingOnes<uint32_t>(BitWidth), DL, MVT::i32);
+    SDValue Old = DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+        {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+    if (StoreOp) {
+      SDValue Val = DAG.getBitcast(IntVT, StoreOp->getValue());
+      Val = DAG.getZExtOrTrunc(Val, DL, MVT::i32);
+      Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val, LowMaskC);
+      Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val, BitInDword);
+      SDValue MaskShifted =
+          DAG.getNode(ISD::SHL, DL, MVT::i32, LowMaskC, BitInDword);
+      SDValue Cleared = DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+                                    DAG.getNOT(DL, MaskShifted, MVT::i32));
+      SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Old.getValue(1), New, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    bool IsSExt = LoadOp->getExtensionType() == ISD::SEXTLOAD;
+    SDValue Field = DAG.getNode(ISD::SRL, DL, MVT::i32, Old, BitInDword);
+    if (IsSExt)
+      Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+                          DAG.getValueType(MemVT));
+    else
+      Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field, LowMaskC);
+    // Narrow/extend per the extension kind; restore a non-integer result type.
+    SDValue Result = DAG.getExtOrTrunc(IsSExt, Field, DL, IntVT);
+    Result = DAG.getBitcast(DataVT, Result);
+    return DAG.getMergeValues({Result, Old.getValue(1)}, DL);
+  };
+
+  // Runtime index; a sub-dword RMW is race-free because VGPRs are per-lane.
+  if (DynByteOffset) {
+    if (BitWidth != 8 && BitWidth != 16 && BitWidth != 32)
+      return Unsupported("dynamic index wider than 32 bits");
+    if (!FileBytes)
+      return Unsupported("dynamic access to an empty VGPR-memory file");
+    // The dynamic index move treats the whole file as one indexed tuple, so the
+    // file's (even-dword-rounded) size must have a VGPR tuple class.
+    unsigned FileDwords = divideCeil(FileBytes, 4u);
+    if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(
+            AMDGPU::getVGPRMemoryFileDwords(FileBytes) * 32))
+      return Unsupported("VGPR-memory file too large for a dynamic index");
+    // The address is a 32-bit addrspace(13) pointer, so the byte offset is
+    // computed in i32, truncating the 64-bit constant part to match: any wrap
+    // is that width's defined behavior, and the UMIN below bounds the index.
+    SDValue DynI32 = DAG.getZExtOrTrunc(DynByteOffset, DL, MVT::i32);
+    SDValue Bytes =
+        DAG.getNode(ISD::ADD, DL, MVT::i32, DynI32,
+                    DAG.getConstant(Lo_32(ByteOffset), DL, MVT::i32));
+    SDValue Index = DAG.getNode(ISD::SRL, DL, MVT::i32, Bytes,
+                                DAG.getConstant(2, DL, MVT::i32));
+    // Clamp the dword index into the file so an out-of-range dynamic access
+    // disturbs only the file's own last register, not arbitrary live VGPRs.
+    Index = DAG.getNode(ISD::UMIN, DL, MVT::i32, Index,
+                        DAG.getConstant(FileDwords - 1, DL, MVT::i32));
+
+    if (BitWidth == 8 || BitWidth == 16) {
+      // The RMW assumes the field stays within one dword, which holds only for
+      // a naturally aligned access; an underaligned one could cross a boundary
+      // at runtime, so reject it rather than silently drop the high bits.
+      if (MemOp->getAlign() < Align(BitWidth / 8))
+        return Unsupported("underaligned sub-dword dynamic access");
+      SDValue ByteInDword = DAG.getNode(ISD::AND, DL, MVT::i32, Bytes,
+                                        DAG.getConstant(3, DL, MVT::i32));
+      SDValue BitInDword = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteInDword,
+                                       DAG.getConstant(3, DL, MVT::i32));
+      return EmitSubDword(Index, BitInDword);
+    }
+
+    // Whole-dword dynamic access: both the constant and runtime parts must be
+    // dword-aligned so the index shift does not silently round down.
+    if (ByteOffset % 4 != 0 || MemOp->getAlign() < Align(4))
+      return Unsupported("misaligned 32-bit dynamic access");
+    if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+      SDValue Val = DAG.getBitcast(MVT::i32, StoreOp->getValue());
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Chain, Val, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    if (LoadOp->getExtensionType() != ISD::NON_EXTLOAD)
+      return Unsupported("extending 32-bit dynamic load");
+    SDValue Ld = DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+        {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+    SDValue Res = DAG.getBitcast(LoadOp->getValueType(0), Ld);
+    return DAG.getMergeValues({Res, Ld.getValue(1)}, DL);
+  }
+
+  // A statically out-of-range constant index would select physical registers
+  // outside the reserved file; diagnose that undefined behavior rather than
+  // miscompile. (Subtract rather than add so a huge offset cannot wrap.)
+  if (ByteOffset > FileBytes || BitWidth / 8 > FileBytes - ByteOffset)
+    return Unsupported("constant index out of range");
+
+  // Sub-dword (8/16-bit) constant-index access. Registers have no sub-dword
+  // addressing, so extract from (load) or RMW (store) the containing dword.
+  if (BitWidth == 8 || BitWidth == 16) {
+    unsigned BitInDword = (ByteOffset % 4) * 8;
+    if (BitInDword + BitWidth > 32)
+      return Unsupported("sub-dword field crosses a dword boundary");
+    return EmitSubDword(DAG.getConstant(ByteOffset / 4, DL, MVT::i32),
+                        DAG.getConstant(BitInDword, DL, MVT::i32));
+  }
+
+  // Whole-dword accesses.
+  if (ByteOffset % 4 != 0)
+    return Unsupported("misaligned multi-dword access");
+  if (BitWidth == 0 || BitWidth % 32 != 0)
+    return Unsupported("access is not a whole number of dwords");
+  if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
+    return Unsupported("access wider than the largest VGPR tuple");
+
+  if (auto *Load = dyn_cast<LoadSDNode>(MemOp)) {
+    if (Load->getExtensionType() != ISD::NON_EXTLOAD)
+      return Unsupported("extending multi-dword load");
+  } else if (cast<StoreSDNode>(MemOp)->isTruncatingStore()) {
+    return Unsupported("truncating multi-dword store");
+  }
+
+  // View the access as i32 / <N x i32> so one node covers it; bitcast when the
+  // memory type is not register legal.
+  EVT RegVT = MemVT;
+  if (!isTypeLegal(RegVT)) {
+    unsigned NumDwords = BitWidth / 32;
+    RegVT = NumDwords == 1
+                ? EVT(MVT::i32)
+                : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumDwords);
+  }
+
+  SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+  if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+    SDValue Value = DAG.getBitcast(RegVT, StoreOp->getValue());
+    return DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_STORE, DL, DAG.getVTList(MVT::Other),
+        {Chain, Value, Index}, MemVT, StoreOp->getMemOperand());
+  }
+
+  SDValue NewLoad = DAG.getMemIntrinsicNode(
+      AMDGPUISD::REG_LOAD, DL, DAG.getVTList(RegVT, MVT::Other), {Chain, Index},
+      MemVT, MemOp->getMemOperand());
+  if (RegVT == MemVT)
+    return NewLoad;
+  SDValue Value = DAG.getNode(ISD::BITCAST, DL, MemVT, NewLoad);
+  return DAG.getMergeValues({Value, NewLoad.getValue(1)}, DL);
+}
+
 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
 /// by the chain and intrinsic ID. Theoretically we would also need to check the
 /// specific intrinsic, but they all place the pointer operand first.
@@ -18569,6 +18979,19 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  // Lower "VGPR as memory" (addrspace(13)) accesses into AMDGPUISD::REG_*. This
+  // is mandatory lowering, but it is done here rather than in LowerOperation
+  // because it must apply to a load/store of *any* value type (including legal
+  // scalars like i32, which are never custom-lowered), and the address space
+  // cannot be expressed in setOperationAction. It is scoped to addrspace(13)
+  // nodes (so ordinary memory is untouched) and runs first in PerformDAGCombine
+  // and replaces the node, so no other combine preempts it.
+  unsigned Opc = N->getOpcode();
+  if ((Opc == ISD::LOAD || Opc == ISD::STORE) &&
+      cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::VGPR)
+    if (SDValue V = LowerLoadStoreVGPR(SDValue(N, 0), DCI.DAG))
+      return V;
+
   switch (N->getOpcode()) {
   case ISD::ABS:
     if (SDValue Res = promoteUniformUnaryOpToI32(SDValue(N, 0), DCI))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c98426cdac0b1..aa1b11e3c4c68 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -125,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLoadStoreVGPR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const;
@@ -453,6 +454,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool mayBeEmittedAsTailCall(const CallInst *) const override;
 
+  bool fallBackToDAGISel(const Instruction &Inst) const override;
+
   bool isEligibleForTailCallOptimization(
     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8c30e53e9b4e4..35303c881955c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -59,6 +59,17 @@ def GFX10Gen         : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.G
 // modifier behavior with dx10_enable.
 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
 
+// "VGPR as memory" (addrspace(13)) load/store carrying a dword index into the
+// reserved VGPR file; selected into the SI_VGPR_FRAME_* pseudos.
+def SDTRegIdxLoad : SDTypeProfile<1, 1,
+    [SDTCisVT<1, i32>]>; // dword_index
+def SDTRegIdxStore : SDTypeProfile<0, 2,
+    [SDTCisVT<1, i32>]>; // data, dword_index
+def SIreg_load : SDNode<"AMDGPUISD::REG_LOAD", SDTRegIdxLoad,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def SIreg_store : SDNode<"AMDGPUISD::REG_STORE", SDTRegIdxStore,
+                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 def SDTSBufferLoad : SDTypeProfile<1, 3,
     [                    // vdata
      SDTCisVT<1, v4i32>, // rsrc
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3594caef86782..a8877ab258a8e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,24 +1243,65 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
 }
 } // End Spill = 1, VALU = 1, isConvergent = 1
 
-// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
-// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
-// within the per-function VGPR file. They are produced during instruction
-// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
-// pass before register allocation.
+// "VGPR as memory" (addrspace(13)) accesses: load/store of a VGPR tuple at a
+// constant dword index in the reserved file. Selected from AMDGPUISD::REG_*
+// (constant index) and rewritten to copies by AMDGPUPrivateObjectVGPRs.
 let hasSideEffects = 0 in {
-def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
-                                        (ins i32imm:$offset)> {
-  let mayLoad = 1;
-  let mayStore = 0;
+foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
+              VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
+              VReg_512, VReg_1024] in {
+  def SI_VGPR_FRAME_LOAD_B#rc.Size : VPseudoInstSI <
+      (outs rc:$vdst), (ins i32imm:$idx)> {
+    let mayLoad = 1;
+    let mayStore = 0;
+  }
+  def SI_VGPR_FRAME_STORE_B#rc.Size : VPseudoInstSI <
+      (outs), (ins rc:$vdata, i32imm:$idx)> {
+    let mayLoad = 0;
+    let mayStore = 1;
+  }
 }
+} // End hasSideEffects = 0
 
-def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
-                                         (ins VGPR_32:$vdata, i32imm:$offset)> {
-  let mayLoad = 0;
-  let mayStore = 1;
+// Same, at a *runtime* dword index ($idx, a VS_32). The custom inserter expands
+// these into an indirect read/write of the file (movrel / s_set_gpr_idx, with a
+// waterfall loop for a divergent index). Only 32-bit accesses for now.
+let usesCustomInserter = 1, hasSideEffects = 0, UseNamedOperandTable = 1 in {
+  def SI_VGPR_FRAME_DYN_LOAD_B32 : VPseudoInstSI <
+      (outs VGPR_32:$vdst), (ins VS_32:$idx)> {
+    let mayLoad = 1;
+    let mayStore = 0;
+  }
+  def SI_VGPR_FRAME_DYN_STORE_B32 : VPseudoInstSI <
+      (outs), (ins VGPR_32:$vdata, VS_32:$idx)> {
+    let mayLoad = 0;
+    let mayStore = 1;
+  }
 }
-} // End hasSideEffects = 0
+
+// Constant dword index -> width-matched frame pseudo.
+multiclass VGPRFrameLoadStorePat<ValueType vt> {
+  defvar load_inst = !cast<Instruction>("SI_VGPR_FRAME_LOAD_B"#vt.Size);
+  defvar store_inst = !cast<Instruction>("SI_VGPR_FRAME_STORE_B"#vt.Size);
+  def : GCNPat<(vt (SIreg_load (i32 imm:$idx))), (load_inst imm:$idx)>;
+  def : GCNPat<(SIreg_store vt:$data, (i32 imm:$idx)),
+               (store_inst $data, imm:$idx)>;
+}
+
+// Non-constant dword index -> dynamic pseudo. Lower complexity than the
+// constant patterns, so a constant index still prefers them.
+foreach vt = Reg32Types.types in {
+  def : GCNPat<(vt (SIreg_load i32:$idx)), (SI_VGPR_FRAME_DYN_LOAD_B32 $idx)>;
+  def : GCNPat<(SIreg_store vt:$data, i32:$idx),
+               (SI_VGPR_FRAME_DYN_STORE_B32 $data, $idx)>;
+}
+
+foreach vt = !listconcat(
+    Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
+    Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
+    Reg288Types.types, Reg320Types.types, Reg352Types.types, Reg384Types.types,
+    Reg512Types.types, Reg1024Types.types) in
+defm : VGPRFrameLoadStorePat<vt>;
 
 // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
 // needs to be used and an extra instruction to move between VGPR and AGPR.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 4be4ce28e6de5..13648e813488d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -183,6 +183,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
       "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
 
+  // "VGPR as memory" file layout from AMDGPULowerModuleVGPRs (~0u base = none).
+  VGPRMemorySize =
+      F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-size", 0);
+  VGPRMemoryBase =
+      F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-base", ~0u);
+
   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
   // VGPR available at all times. For now, reserve highest available VGPR. After
   // RA, shift it to the lowest available unused VGPR if the one exist.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 1f43505650222..4d799da91c7f0 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -587,6 +587,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
   // the serialization easier.
   ReservedRegSet WWMReservedRegs;
 
+  // "VGPR as memory" (addrspace(13)) file assigned by AMDGPULowerModuleVGPRs:
+  // size in bytes and the shared base register index (~0u = none). Reserved out
+  // of allocation for the whole function, like LDS; offsets come from metadata.
+  unsigned VGPRMemorySize = 0;
+  unsigned VGPRMemoryBase = ~0u;
+
   bool IsWholeWaveFunction = false;
 
   using PrologEpilogSGPRSpill =
@@ -690,6 +696,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
   const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
   const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
 
+  // "VGPR as memory" file size in bytes (0 if none) and shared base register.
+  unsigned getVGPRMemorySize() const { return VGPRMemorySize; }
+  unsigned getVGPRMemoryBase() const { return VGPRMemoryBase; }
+
   bool isWWMReservedRegister(Register Reg) const {
     return WWMReservedRegs.contains(Reg);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9700720f0373a..94098c8d863ba 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -584,6 +584,45 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
   return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
 }
 
+std::pair<unsigned, unsigned>
+SIRegisterInfo::getVGPRMemoryFile(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned Bytes = MFI->getVGPRMemorySize();
+  if (!Bytes)
+    return {0, 0};
+
+  // Size in dwords, rounded up to an even count; intended to keep wide tuple
+  // accesses aligned on targets that require aligned VGPR tuples.
+  unsigned Dwords = AMDGPU::getVGPRMemoryFileDwords(Bytes);
+
+  // The base is assigned module-wide by AMDGPULowerModuleVGPRs. Size and base
+  // are read from function attributes, so validate them in release builds too.
+  unsigned BaseIdx = MFI->getVGPRMemoryBase();
+  if (BaseIdx == ~0u)
+    report_fatal_error("VGPR-memory size set without a base");
+  uint64_t FileEnd = uint64_t(BaseIdx) + Dwords; // 64-bit sum: cannot wrap.
+  if (FileEnd > ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()))
+    report_fatal_error("VGPR-as-memory file does not fit");
+
+  // The file [BaseIdx, FileEnd) must not overlap any VGPR ABI input.
+  // A small file sits below the work-item-ID register; a larger one is placed
+  // above it by the module pass. Verify no overlap remains rather than risk
+  // silently clobbering an input.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (const auto &LI : MRI.liveins()) {
+    MCRegister Reg = LI.first;
+    const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+    if (!RC || !isVGPRClass(RC))
+      continue;
+    unsigned Start = getHWRegIndex(Reg);
+    unsigned End = Start + getRegSizeInBits(*RC) / 32u;
+    if (BaseIdx < End && Start < FileEnd)
+      report_fatal_error("VGPR-as-memory file overlaps a VGPR ABI input");
+  }
+
+  return {BaseIdx, Dwords};
+}
+
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::MODE);
@@ -747,6 +786,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   for (Register Reg : MFI->getWWMReservedRegs())
     reserveRegisterTuples(Reserved, Reg);
 
+  // Reserve the registers backing "VGPR as memory" (addrspace(13)) objects
+  // (see getVGPRMemoryFile).
+  auto [VGPRMemBase, VGPRMemCount] = getVGPRMemoryFile(MF);
+  for (unsigned I = 0; I != VGPRMemCount; ++I)
+    reserveRegisterTuples(Reserved,
+                          AMDGPU::VGPR_32RegClass.getRegister(VGPRMemBase + I));
+
   // FIXME: Stop using reserved registers for this.
   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
     reserveRegisterTuples(Reserved, Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..1d7fcee791af1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -96,6 +96,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   bool isAsmClobberable(const MachineFunction &MF,
                         MCRegister PhysReg) const override;
 
+  /// The "VGPR as memory" (addrspace(13)) register file: a fixed block of
+  /// physical VGPRs reserved for the whole function, placed (like LDS) at a
+  /// location consistent across the call graph. Returns the VGPR_32 index of
+  /// the first file register and the dword register count, or {0, 0} if the
+  /// function has no such objects.
+  std::pair<unsigned, unsigned>
+  getVGPRMemoryFile(const MachineFunction &MF) const;
+
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
   const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7528cd2a009a3..96571dd028b14 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,7 +18,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1780,17 +1779,6 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
   return false;
 }
 
-AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
-  const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
-  assert(MD && MD->getNumOperands() == 2 &&
-         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
-  unsigned Address =
-      mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
-  unsigned Size =
-      mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
-  return {Address, Size};
-}
-
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 923c5c3a988fd..d74cc3da3593e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -31,7 +31,6 @@ struct amd_kernel_code_t;
 namespace llvm {
 
 struct Align;
-class AllocaInst;
 class Argument;
 class Function;
 class GlobalValue;
@@ -50,6 +49,14 @@ namespace AMDGPU {
 struct AMDGPUMCKernelCodeT;
 struct IsaVersion;
 
+/// Number of dword registers a "VGPR as memory" (addrspace(13)) file of
+/// \p Bytes bytes occupies: \p Bytes rounded up to whole dwords, then up to an
+/// even dword count. Computed without intermediate overflow for any \p Bytes.
+inline unsigned getVGPRMemoryFileDwords(unsigned Bytes) {
+  unsigned Dwords = Bytes / 4u + (Bytes % 4u != 0); // divideCeil, no overflow
+  return Dwords + (Dwords & 1u);                    // alignTo(Dwords, 2)
+}
+
 /// Generic target versions emitted by this version of LLVM.
 ///
 /// These numbers are incremented every time a codegen breaking change occurs
@@ -1038,16 +1045,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
 /// Checks if \p Val is inside \p MD, a !range-like metadata.
 bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
 
-/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
-/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
-/// to within the VGPR file, and its size in bytes.
-struct AllocatedVGPRsMetadata {
-  unsigned Address;
-  unsigned Size;
-
-  static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
-};
-
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.
 
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..67365cdc38b88 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -273,8 +273,10 @@ static std::string computeAMDDataLayout(const Triple &TT) {
   // (address space 7), and 128-bit non-integral buffer resourcees (address
   // space 8) which cannot be non-trivilally accessed by LLVM memory operations
   // like getelementptr.
+  // Address space 13 ("VGPR as memory") uses 32-bit register-relative indices.
   return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
-         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
+         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:"
+         "64-"
          "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
          "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
deleted file mode 100644
index f6c64c5121867..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
-
-define void @vgpr_alloca() {
-; CHECK-LABEL: define void @vgpr_alloca(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  ret void
-}
-
-define void @vgpr_alloca_multiple() {
-; CHECK-LABEL: define void @vgpr_alloca_multiple(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
-; CHECK-NEXT:    [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[B]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca i32, align 4, addrspace(13)
-  %b = alloca [2 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  store i32 0, ptr addrspace(13) %b
-  ret void
-}
-
-define void @private_alloca_unchanged() {
-; CHECK-LABEL: define void @private_alloca_unchanged(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:    store i64 42, ptr addrspace(5) [[A]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i64], align 4, addrspace(5)
-  store i64 42, ptr addrspace(5) %a
-  ret void
-}
-
-declare void @use(ptr)
-
-; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
-; back to ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
-; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
-; CHECK-NEXT:    store i32 [[V]], ptr addrspace(5) [[P2]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
-  store i32 %v, ptr addrspace(13) %p
-  ret void
-}
-
-; A VGPR object whose address escapes (here via a cast to a generic pointer, as
-; the frontend emits) cannot be kept in registers yet, so it falls back to
-; ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_escaping() {
-; CHECK-LABEL: define void @vgpr_alloca_escaping(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
-; CHECK-NEXT:    call void @use(ptr [[CAST]])
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %cast = addrspacecast ptr addrspace(13) %a to ptr
-  call void @use(ptr %cast)
-  ret void
-}
-
-; Whole-dword-multiple accesses (here i64) stay in VGPRs.
-define void @vgpr_alloca_i64(i64 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_i64(
-; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
-; CHECK-NEXT:    store i64 [[V]], ptr addrspace(13) [[A]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a = alloca i64, align 8, addrspace(13)
-  store i64 %v, ptr addrspace(13) %a
-  ret void
-}
-
-; Sub-dword accesses are not supported yet, so the object falls back to scratch.
-define void @vgpr_alloca_subdword(i16 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_subdword(
-; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
-; CHECK-NEXT:    store i16 [[V]], ptr addrspace(5) [[A1]], align 2
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [2 x i16], align 4, addrspace(13)
-  store i16 %v, ptr addrspace(13) %a
-  ret void
-}
-;.
-; CHECK: [[META0]] = !{i32 0, i32 16}
-; CHECK: [[META1]] = !{i32 0, i32 4}
-; CHECK: [[META2]] = !{i32 4, i32 8}
-; CHECK: [[META3]] = !{i32 0, i32 8}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
deleted file mode 100644
index 63ba44b479279..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
-; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
-; scratch.
-
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-
-define void @vgpr_obj() {
-; SUPP:   alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
-; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
deleted file mode 100644
index ea914907a900d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
-
-; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
-; indices must lower to register copies, never to scratch/buffer memory traffic.
-
-; CHECK-LABEL: store_load_i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_i32(i32 %v) {
-  %a = alloca i32, align 4, addrspace(13)
-  store i32 %v, ptr addrspace(13) %a
-  %l = load i32, ptr addrspace(13) %a
-  %r = add i32 %l, 1
-  ret i32 %r
-}
-
-; CHECK-LABEL: store_load_array:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_array(i32 %v) {
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
-  %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
-  store i32 %v, ptr addrspace(13) %p1
-  store i32 7, ptr addrspace(13) %p3
-  %l1 = load i32, ptr addrspace(13) %p1
-  %l3 = load i32, ptr addrspace(13) %p3
-  %s = add i32 %l1, %l3
-  ret i32 %s
-}
-
-; A 64-bit (two-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_i64:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i64 @store_load_i64(i64 %v) {
-  %a = alloca i64, align 8, addrspace(13)
-  store i64 %v, ptr addrspace(13) %a
-  %l = load i64, ptr addrspace(13) %a
-  %r = add i64 %l, 1
-  ret i64 %r
-}
-
-; A vector (four-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_v4i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
-  %a = alloca <4 x i32>, align 16, addrspace(13)
-  store <4 x i32> %v, ptr addrspace(13) %a
-  %l = load <4 x i32>, ptr addrspace(13) %a
-  ret <4 x i32> %l
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 0dbabd2991bc4..13db81d89e43d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -27,9 +27,9 @@
 ; GCN-O0-NEXT: amdgpu-lower-exec-sync
 ; GCN-O0-NEXT: amdgpu-sw-lower-lds
 ; GCN-O0-NEXT: amdgpu-lower-module-lds
+; GCN-O0-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O0-NEXT: function
 ; GCN-O0-NEXT:   atomic-expand
-; GCN-O0-NEXT:   amdgpu-vgpr-allocate
 ; GCN-O0-NEXT:   verify
 ; GCN-O0-NEXT:   unreachableblockelim
 ; GCN-O0-NEXT:   ee-instrument<post-inline>
@@ -81,6 +81,7 @@
 ; GCN-O0-NEXT:       si-lower-wwm-copies
 ; GCN-O0-NEXT:       amdgpu-reserve-wwm-regs
 ; GCN-O0-NEXT:       regallocfast<filter=vgpr>
+; GCN-O0-NEXT:       amdgpu-private-object-vgprs
 ; GCN-O0-NEXT:       si-fix-vgpr-copies
 ; GCN-O0-NEXT:       remove-redundant-debug-values
 ; GCN-O0-NEXT:       fixup-statepoint-caller-saved
@@ -129,6 +130,7 @@
 ; GCN-O2-NEXT: amdgpu-lower-exec-sync
 ; GCN-O2-NEXT: amdgpu-sw-lower-lds
 ; GCN-O2-NEXT: amdgpu-lower-module-lds
+; GCN-O2-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O2-NEXT: function
 ; GCN-O2-NEXT:   amdgpu-atomic-optimizer
 ; GCN-O2-NEXT:   atomic-expand
@@ -253,6 +255,7 @@
 ; GCN-O2-NEXT:       stack-slot-coloring
 ; GCN-O2-NEXT:       machine-cp
 ; GCN-O2-NEXT:       machinelicm
+; GCN-O2-NEXT:       amdgpu-private-object-vgprs
 ; GCN-O2-NEXT:       si-fix-vgpr-copies
 ; GCN-O2-NEXT:       si-optimize-exec-masking
 ; GCN-O2-NEXT:       remove-redundant-debug-values
@@ -315,6 +318,7 @@
 ; GCN-O3-NEXT: amdgpu-lower-exec-sync
 ; GCN-O3-NEXT: amdgpu-sw-lower-lds
 ; GCN-O3-NEXT: amdgpu-lower-module-lds
+; GCN-O3-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O3-NEXT: function
 ; GCN-O3-NEXT:   amdgpu-atomic-optimizer
 ; GCN-O3-NEXT:   atomic-expand
@@ -439,6 +443,7 @@
 ; GCN-O3-NEXT:       stack-slot-coloring
 ; GCN-O3-NEXT:       machine-cp
 ; GCN-O3-NEXT:       machinelicm
+; GCN-O3-NEXT:       amdgpu-private-object-vgprs
 ; GCN-O3-NEXT:       si-fix-vgpr-copies
 ; GCN-O3-NEXT:       si-optimize-exec-masking
 ; GCN-O3-NEXT:       remove-redundant-debug-values
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index aabfadd33e976..960cbb1a0def2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,15 +47,14 @@
 ; GCN-O0-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O0-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O0-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand Atomic instructions
-; GCN-O0-NEXT:      Dominator Tree Construction
-; GCN-O0-NEXT:      Natural Loop Information
-; GCN-O0-NEXT:      AMDGPU VGPR Allocate
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O0-NEXT:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; GCN-O0-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT:      Expand reduction intrinsics
+; GCN-O0-NEXT:      Dominator Tree Construction
 ; GCN-O0-NEXT:      AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT:    Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT:    AMDGPU lower intrinsics
@@ -117,7 +116,6 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Slot index numbering
 ; GCN-O0-NEXT:        Live Interval Analysis
-; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Whole Quad Mode
 ; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
@@ -132,6 +130,7 @@
 ; GCN-O0-NEXT:        SI Lower WWM Copies
 ; GCN-O0-NEXT:        AMDGPU Reserve WWM Registers
 ; GCN-O0-NEXT:        Fast Register Allocator
+; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Fix VGPR copies
 ; GCN-O0-NEXT:        Remove Redundant DEBUG_VALUE analysis
 ; GCN-O0-NEXT:        Fixup Statepoint Caller Saved
@@ -210,6 +209,7 @@
 ; GCN-O1-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O1-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Dominator Tree Construction
 ; GCN-O1-NEXT:      Cycle Info Analysis
@@ -362,7 +362,6 @@
 ; GCN-O1-NEXT:        Live Interval Analysis
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Register Coalescer
-; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
@@ -402,6 +401,7 @@
 ; GCN-O1-NEXT:        Stack Slot Coloring
 ; GCN-O1-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        SI Fix VGPR copies
 ; GCN-O1-NEXT:        SI optimize exec mask operations
 ; GCN-O1-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -502,6 +502,7 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-OPTS-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-OPTS-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O1-OPTS-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
@@ -680,7 +681,6 @@
 ; GCN-O1-OPTS-NEXT:        Live Interval Analysis
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Register Coalescer
-; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-OPTS-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
@@ -721,6 +721,7 @@
 ; GCN-O1-OPTS-NEXT:        Stack Slot Coloring
 ; GCN-O1-OPTS-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-OPTS-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        SI Fix VGPR copies
 ; GCN-O1-OPTS-NEXT:        SI optimize exec mask operations
 ; GCN-O1-OPTS-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -822,6 +823,7 @@
 ; GCN-O2-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O2-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O2-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O2-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Dominator Tree Construction
 ; GCN-O2-NEXT:      Cycle Info Analysis
@@ -1003,7 +1005,6 @@
 ; GCN-O2-NEXT:        Live Interval Analysis
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Register Coalescer
-; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O2-NEXT:        Rewrite Partial Register Uses
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
@@ -1045,6 +1046,7 @@
 ; GCN-O2-NEXT:        Stack Slot Coloring
 ; GCN-O2-NEXT:        Machine Copy Propagation Pass
 ; GCN-O2-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        SI Fix VGPR copies
 ; GCN-O2-NEXT:        SI optimize exec mask operations
 ; GCN-O2-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -1146,6 +1148,7 @@
 ; GCN-O3-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O3-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O3-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O3-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Dominator Tree Construction
 ; GCN-O3-NEXT:      Cycle Info Analysis
@@ -1340,7 +1343,6 @@
 ; GCN-O3-NEXT:        Live Interval Analysis
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Register Coalescer
-; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O3-NEXT:        Rewrite Partial Register Uses
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
@@ -1382,6 +1384,7 @@
 ; GCN-O3-NEXT:        Stack Slot Coloring
 ; GCN-O3-NEXT:        Machine Copy Propagation Pass
 ; GCN-O3-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        SI Fix VGPR copies
 ; GCN-O3-NEXT:        SI optimize exec mask operations
 ; GCN-O3-NEXT:        Remove Redundant DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 79c11fb2a7c37..4f340c6938ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -55,7 +55,7 @@
 @nullptr12 = global ptr addrspace(12) addrspacecast (ptr null to ptr addrspace(12))
 
 ; CHECK-LABEL: nullptr13:
-; R600-NEXT: .long 0
+; CHECK-NEXT: .long -1
 @nullptr13 = global ptr addrspace(13) addrspacecast (ptr null to ptr addrspace(13))
 
 ; CHECK-LABEL: nullptr14:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index fc5dabc584863..1a73c35f83f8f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -49,6 +49,7 @@
 ; O0-NEXT: SI Lower WWM Copies
 ; O0-NEXT: AMDGPU Reserve WWM Registers
 ; O0-NEXT: Fast Register Allocator
+; O0-NEXT: AMDGPU Private Object VGPRs
 ; O0-NEXT: SI Fix VGPR copies
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
new file mode 100644
index 0000000000000..50efa3936c365
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; An addrspacecast to or from the "VGPR as memory" address space (13) has no
+; meaningful numeric translation, so it is allowed but lowers to poison rather
+; than being rejected. ptrtoint/inttoptr are also IR-legal; materializing such a
+; pointer on its own (as below) has no defined value, but an actual memory
+; access through an inttoptr value is a real dynamic-indexed access - see
+; @dyn_inttoptr in vgpr-as-memory-dynamic.ll.
+
+@g = internal addrspace(13) global i32 poison
+
+define ptr @cast_to_generic() {
+; CHECK-LABEL: cast_to_generic:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %c = addrspacecast ptr addrspace(13) @g to ptr
+  ret ptr %c
+}
+
+define ptr addrspace(13) @cast_to_vgpr(ptr %p) {
+; CHECK-LABEL: cast_to_vgpr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %c = addrspacecast ptr %p to ptr addrspace(13)
+  ret ptr addrspace(13) %c
+}
+
+define ptr addrspace(13) @inttoptr_vgpr(i32 %x) {
+; CHECK-LABEL: inttoptr_vgpr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = inttoptr i32 %x to ptr addrspace(13)
+  ret ptr addrspace(13) %p
+}
+
+define i32 @ptrtoint_vgpr(ptr addrspace(13) %p) {
+; CHECK-LABEL: ptrtoint_vgpr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %i = ptrtoint ptr addrspace(13) %p to i32
+  ret i32 %i
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
new file mode 100644
index 0000000000000..4b90e28d51235
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A device function that uses the work-item ID keeps it in the fixed high
+; register v31, while the shared "VGPR as memory" file sits at a low base. The
+; two must not be confused: the file stays below v31 (it does not have to clear
+; it), and the access still lowers to a register copy.
+
+@g = internal addrspace(13) global i32 poison
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define void @dev() #0 {
+; CHECK-LABEL: dev:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  store i32 %id, ptr addrspace(13) @g
+  ret void
+}
+
+define amdgpu_kernel void @k() {
+; CHECK-LABEL: k:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 s12, s8
+; CHECK-NEXT:    s_add_u32 s8, s4, 36
+; CHECK-NEXT:    s_mov_b32 s13, s9
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, dev at gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, dev at gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT:    s_mov_b32 s14, s10
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v31, v0
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT:    s_endpgm
+  call void @dev()
+  ret void
+}
+
+attributes #0 = { noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
new file mode 100644
index 0000000000000..fb763cd31e339
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O2
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A "VGPR as memory" access through a constant-expression GEP must lower to a
+; register copy, not the pc-relative global-address sequence (which previously
+; crashed because addrspace(13) pointers are 32-bit). Exercised at -O0 too,
+; where the address is materialized standalone rather than folded.
+
+@buf = internal addrspace(13) global [4 x i32] poison
+
+define void @store_constexpr_gep(i32 %v) {
+; O0-LABEL: store_constexpr_gep:
+; O0:       ; %bb.0:
+; O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT:    v_mov_b32_e32 v4, v0
+; O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; O2-LABEL: store_constexpr_gep:
+; O2:       ; %bb.0:
+; O2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT:    v_mov_b32_e32 v4, v0
+; O2-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+  ret void
+}
+
+define i32 @load_constexpr_gep() {
+; O0-LABEL: load_constexpr_gep:
+; O0:       ; %bb.0:
+; O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT:    v_mov_b32_e32 v0, v4
+; O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; O2-LABEL: load_constexpr_gep:
+; O2:       ; %bb.0:
+; O2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT:    v_mov_b32_e32 v0, v4
+; O2-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+  ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
new file mode 100644
index 0000000000000..97a58b4d16436
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
@@ -0,0 +1,346 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s -o /dev/null
+
+; A runtime (non-constant) index into a "VGPR as memory" object becomes an
+; indexed move into the reserved VGPR file: s_set_gpr_idx on gfx9, movrel on
+; gfx10+, with a waterfall loop for a divergent index.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+@buf8 = internal addrspace(13) global [16 x i8] poison
+@buf16 = internal addrspace(13) global [16 x i16] poison
+
+define amdgpu_kernel void @dyn_uniform(ptr addrspace(1) %out, i32 %i, i32 %v) {
+; GFX942-LABEL: dyn_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_andn2_b32 s2, s2, -2.0
+; GFX942-NEXT:    v_mov_b32_e32 v18, s3
+; GFX942-NEXT:    s_min_u32 s2, s2, 15
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dword v0, v18, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_and_not1_b32 s2, s2, -2.0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_min_u32 m0, s2, 15
+; GFX11-NEXT:    v_movreld_b32_e32 v2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %i
+  store i32 %v, ptr addrspace(13) %p
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_load(ptr addrspace(1) %out) {
+; GFX942-LABEL: dyn_divergent_load:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_min_u32_e32 v0, 15, v0
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v18, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v1, v18, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_load:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v1, 15, v1
+; GFX11-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s3, v1
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s3
+; GFX11-NEXT:    v_movrels_b32_e32 v18, v2
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v18, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_store(ptr addrspace(1) %out, i32 %v) {
+; GFX942-LABEL: dyn_divergent_store:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_min_u32_e32 v0, 15, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_store:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_min_u32_e32 v0, 15, v0
+; GFX11-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_movreld_b32_e32 v2, s0
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+; Sub-dword (i8/i16) at a runtime index: the containing dword is read-modify-
+; written with the bit position computed at runtime.
+define amdgpu_kernel void @dyn_i8_uniform(ptr addrspace(1) %out, i32 %i, i8 %v) {
+; GFX942-LABEL: dyn_i8_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s5, s2, 3
+; GFX942-NEXT:    s_and_b32 s4, s3, 0xff
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX942-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX942-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX942-NEXT:    s_lshl_b32 s5, 0xff, s5
+; GFX942-NEXT:    s_min_u32 s2, s2, 3
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    v_not_b32_e32 v6, s5
+; GFX942-NEXT:    v_and_b32_e32 v1, v1, v6
+; GFX942-NEXT:    v_or_b32_e32 v1, s4, v1
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_i8_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s4, s2, 3
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX11-NEXT:    s_min_u32 m0, s2, 3
+; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s4
+; GFX11-NEXT:    v_movrels_b32_e32 v0, v2
+; GFX11-NEXT:    v_not_b32_e32 v1, s2
+; GFX11-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX11-NEXT:    v_mov_b32_e32 v6, s3
+; GFX11-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX11-NEXT:    global_store_b8 v1, v6, s[0:1]
+; GFX11-NEXT:    v_movreld_b32_e32 v2, v0
+; GFX11-NEXT:    s_endpgm
+  %p = getelementptr [16 x i8], ptr addrspace(13) @buf8, i32 0, i32 %i
+  store i8 %v, ptr addrspace(13) %p
+  %l = load i8, ptr addrspace(13) %p
+  store i8 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_i16_divergent(ptr addrspace(1) %out, i16 %v) {
+; GFX942-LABEL: dyn_i16_divergent:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    v_and_b32_e32 v10, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v10, 4, v10
+; GFX942-NEXT:    s_mov_b32 s2, 0xffff
+; GFX942-NEXT:    v_bfe_u32 v0, v0, 1, 9
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_lshlrev_b32_e64 v11, v10, s2
+; GFX942-NEXT:    v_min_u32_e32 v0, 7, v0
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    v_bfi_b32 v11, v11, 0, v12
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s6, 0xffff
+; GFX942-NEXT:    v_lshl_or_b32 v10, s2, v10, v11
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_3
+; GFX942-NEXT:  ; %bb.4:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_i16_divergent:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v10, 4, v1
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 1, 9
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_lshlrev_b32_e64 v11, v10, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    v_min_u32_e32 v1, 7, v1
+; GFX11-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v1
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s4
+; GFX11-NEXT:    v_movrels_b32_e32 v12, v2
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v11, v11, 0, v12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s3, s2, 0xffff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_lshl_or_b32 v10, s3, v10, v11
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:  .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v1
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s4
+; GFX11-NEXT:    v_movreld_b32_e32 v2, v10
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB4_3
+; GFX11-NEXT:  ; %bb.4:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i16], ptr addrspace(13) @buf16, i32 0, i32 %tid
+  store i16 %v, ptr addrspace(13) %p
+  %l = load i16, ptr addrspace(13) %p
+  store i16 %l, ptr addrspace(1) %out
+  ret void
+}
+
+; An addrspace(13) pointer built with inttoptr is not poison: the integer is the
+; register-relative byte offset, so the access is lowered as a dynamic index
+; (clamped) like any other runtime index. A direct reference to @buf reserves
+; the file for this function.
+define amdgpu_kernel void @dyn_inttoptr(ptr addrspace(1) %out, i32 %off, i32 %v) {
+; GFX942-LABEL: dyn_inttoptr:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    s_min_u32 s2, s2, 15
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_inttoptr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX11-NEXT:    s_min_u32 m0, s2, 15
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_movreld_b32_e32 v2, s3
+; GFX11-NEXT:    v_mov_b32_e32 v1, v2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  store i32 0, ptr addrspace(13) @buf
+  %p = inttoptr i32 %off to ptr addrspace(13)
+  store i32 %v, ptr addrspace(13) %p
+  %l = load i32, ptr addrspace(13) @buf
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
new file mode 100644
index 0000000000000..78f22f55ea68e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A compile-time index past the end of the "VGPR as memory" file is out of range
+; (it would otherwise select physical registers outside the reserved file), so
+; it is diagnosed rather than miscompiled.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: constant index out of range
+define amdgpu_kernel void @const_oob() {
+  %p = getelementptr i32, ptr addrspace(13) @buf, i32 1000
+  %v = load i32, ptr addrspace(13) %p
+  store i32 %v, ptr addrspace(13) @buf
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
new file mode 100644
index 0000000000000..514ddce90ae9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A dynamic index addresses the whole "VGPR as memory" file as one indexed
+; tuple. A file whose (even-dword-rounded) size has no VGPR tuple class - e.g.
+; 14 dwords - is diagnosed rather than aborting the compiler.
+
+@buf = internal addrspace(13) global [14 x i32] poison
+
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: VGPR-memory file too large for a dynamic index
+define amdgpu_kernel void @dynamic_toolarge(i32 %i) {
+  %p = getelementptr i32, ptr addrspace(13) @buf, i32 %i
+  %v = load i32, ptr addrspace(13) %p
+  store i32 %v, ptr addrspace(13) @buf
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
new file mode 100644
index 0000000000000..2329336b60553
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; The "VGPR as memory" file lives in low, caller-saved VGPRs that only call-graph
+; members reserve. A call to an external (or indirect) callee would clobber it,
+; so AMDGPULowerModuleVGPRs diagnoses it at the IR level, and the post-RA
+; AMDGPUPrivateObjectVGPRs pass independently diagnoses the (attribute-less)
+; machine call - this also covers calls introduced after the module pass.
+
+@g = internal addrspace(13) global i32 poison
+
+declare void @ext()
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+; CHECK: error: {{.*}}call to a function that clobbers the 'VGPR as memory' reserved file
+define amdgpu_kernel void @extern_call() {
+  store i32 1, ptr addrspace(13) @g
+  call void @ext()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
new file mode 100644
index 0000000000000..52b38ab0e9255
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; An indirect call cannot be proven to stay within the call graph that reserves
+; the "VGPR as memory" file, so it could clobber the file. AMDGPULowerModuleVGPRs
+; diagnoses it at the IR level; the post-RA check also flags the machine call.
+
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+; CHECK: error: {{.*}}call to a function that clobbers the 'VGPR as memory' reserved file
+define amdgpu_kernel void @indirect_call(ptr %fp) {
+  store i32 1, ptr addrspace(13) @g
+  call void %fp()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
new file mode 100644
index 0000000000000..8dd5890b6170d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; The "VGPR as memory" file is a block of reserved physical VGPRs. Inline asm
+; that explicitly clobbers one of those registers would corrupt the file, so
+; AMDGPUPrivateObjectVGPRs diagnoses it after register allocation, where the
+; reserved registers are final. (For this function the file is at v2.)
+
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: error: {{.*}}inline asm clobbers a 'VGPR as memory' reserved register
+define void @asm_clobber(i32 %v) {
+  store i32 %v, ptr addrspace(13) @g
+  call void asm sideeffect "", "~{v2}"()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
new file mode 100644
index 0000000000000..06d30cebb03c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A file-using device function that is not reached from any kernel forms a group
+; on its own; a call to a defined function outside that group would clobber the
+; file's reserved registers, so it is diagnosed (not just external/indirect
+; calls).
+
+@g = internal addrspace(13) global i32 poison
+
+define void @other() {
+  ret void
+}
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+define void @dev_user() {
+  store i32 1, ptr addrspace(13) @g
+  call void @other()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
new file mode 100644
index 0000000000000..4fe434f6a2d18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; Additional "VGPR as memory" accesses the backend cannot lower, each diagnosed
+; rather than reaching instruction selection.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+
+; A dynamic sub-dword access must be naturally aligned so the read-modify-write
+; stays within one dword.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: underaligned sub-dword dynamic access
+define void @underaligned_dyn_subdword(i32 %i, i16 %v) {
+  %p = getelementptr i16, ptr addrspace(13) @buf, i32 %i
+  store i16 %v, ptr addrspace(13) %p, align 1
+  ret void
+}
+
+; A dynamic whole-dword access must be dword aligned.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: misaligned 32-bit dynamic access
+define void @misaligned_dyn_dword(i32 %i, i32 %v) {
+  %p = getelementptr i8, ptr addrspace(13) @buf, i32 %i
+  %p2 = getelementptr i8, ptr addrspace(13) %p, i32 2
+  store i32 %v, ptr addrspace(13) %p2, align 4
+  ret void
+}
+
+; A constant sub-dword field must not straddle a dword boundary.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: sub-dword field crosses a dword boundary
+define i16 @const_subdword_crosses_dword() {
+  %p = getelementptr i8, ptr addrspace(13) @buf, i32 3
+  %v = load i16, ptr addrspace(13) %p, align 1
+  ret i16 %v
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
new file mode 100644
index 0000000000000..7b1b011252846
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; "VGPR as memory" accesses that the backend cannot lower are diagnosed instead
+; of reaching instruction selection as unselectable memory operations.
+
+@buf = internal addrspace(13) global [16 x i64] poison
+
+; A dynamic index into a wider-than-dword element is unsupported.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: dynamic index wider than 32 bits
+define amdgpu_kernel void @wide_dynamic(i32 %i) {
+  %p = getelementptr i64, ptr addrspace(13) @buf, i32 %i
+  %v = load i64, ptr addrspace(13) %p
+  store i64 %v, ptr addrspace(13) @buf
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
new file mode 100644
index 0000000000000..da0ff1a1a30b3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
@@ -0,0 +1,18 @@
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; A "VGPR as memory" global referenced only from an ordinary (non-kernel)
+; function - as IPO might leave after outlining code from a kernel - is still
+; laid out, and the referencing function is annotated. The backend handles
+; direct references to the global from any function, independently of the
+; frontend's placement rules.
+
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: define void @user(i32 %v) #[[ATTR:[0-9]+]]
+define void @user(i32 %v) {
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+; CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-vgpr-memory-base"="{{[0-9]+}}"{{.*}}"amdgpu-vgpr-memory-size"="4"
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
new file mode 100644
index 0000000000000..0dc6dbca45480
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+
+; GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses;
+; fallBackToDAGISel makes such functions fall back to SelectionDAG, which lowers
+; them to register copies rather than crashing in reg-bank legalization.
+
+@g = internal addrspace(13) global i32 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
new file mode 100644
index 0000000000000..6da6f49a9e082
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; AMDGPULowerModuleVGPRs lays out addrspace(13) globals into per-call-graph
+; groups: disjoint kernels (@k1/@a, @k2/@b) get independent layouts and bases,
+; while functions that share a global (@writer/@reader/@g, reached from @k3)
+; share one consistent group, so the address resolves to the same registers.
+
+@a = internal addrspace(13) global [4 x i32] poison
+@b = internal addrspace(13) global [8 x i32] poison
+@g = internal addrspace(13) global i32 poison
+
+;.
+; CHECK: @a = internal addrspace(13) global [4 x i32] poison, !amdgpu.vgpr.memory.offset [[META0:![0-9]+]]
+; CHECK: @b = internal addrspace(13) global [8 x i32] poison, !amdgpu.vgpr.memory.offset [[META0]]
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset [[META0]]
+;.
+define amdgpu_kernel void @k1(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k1(
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT:    store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %p = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @k2(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k2(
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT:    store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %p = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define void @writer(i32 %v) {
+; CHECK-LABEL: @writer(
+; CHECK-NEXT:    store i32 [[V:%.*]], ptr addrspace(13) @g, align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @reader() {
+; CHECK-LABEL: @reader(
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) @g, align 4
+; CHECK-NEXT:    ret i32 [[L]]
+;
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
+
+define amdgpu_kernel void @k3(ptr addrspace(1) %out, i32 %v) {
+; CHECK-LABEL: @k3(
+; CHECK-NEXT:    call void @writer(i32 [[V:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call i32 @reader()
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  call void @writer(i32 %v)
+  %r = call i32 @reader()
+  store i32 %r, ptr addrspace(1) %out
+  ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="16" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="32" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="4" }
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
new file mode 100644
index 0000000000000..44193d15016f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; Sub-dword (i8/i16) "VGPR as memory" accesses at a constant index are realized
+; as a read-modify-write of the containing dword (shifts and masks), since
+; registers have no sub-dword addressing.
+
+@b = internal addrspace(13) global [8 x i8] poison
+@h = internal addrspace(13) global [4 x i16] poison
+
+define void @store_i8(i8 %v) {
+; CHECK-LABEL: store_i8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff00ff, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+  store i8 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define i8 @load_i8() {
+; CHECK-LABEL: load_i8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+  %l = load i8, ptr addrspace(13) %p
+  ret i8 %l
+}
+
+define void @store_i16(i16 %v) {
+; CHECK-LABEL: store_i16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+  store i16 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define signext i16 @load_i16_sext() {
+; CHECK-LABEL: load_i16_sext:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+  %l = load i16, ptr addrspace(13) %p
+  ret i16 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
new file mode 100644
index 0000000000000..3c7347c0baaae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" (addrspace(13)) accesses at a constant index lower to plain
+; register copies to/from the reserved VGPR file - never to scratch or buffer
+; memory - and writer/reader of the same global resolve to the same register.
+
+@g = internal addrspace(13) global i32 poison
+@arr = internal addrspace(13) global [4 x i32] poison
+@g64 = internal addrspace(13) global i64 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
+
+define void @store_arr(i32 %v) {
+; CHECK-LABEL: store_arr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define i32 @load_arr() {
+; CHECK-LABEL: load_arr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v4
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+  %l = load i32, ptr addrspace(13) %p
+  ret i32 %l
+}
+
+define void @store_i64(i64 %v) {
+; CHECK-LABEL: store_i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i64 %v, ptr addrspace(13) @g64
+  ret void
+}
+
+define i64 @load_i64() {
+; CHECK-LABEL: load_i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, v3
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i64, ptr addrspace(13) @g64
+  ret i64 %l
+}
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index bd760de79c9d0..3ca15083959ad 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,24 +2,26 @@
 
 target triple = "amdgcn-amd-amdhsa"
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: %alloca.13 = alloca i32, align 4, addrspace(13)
 define void @static_alloca() {
 entry:
   %alloca.0 = alloca i32, align 4
@@ -36,23 +38,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i32(i32 %n) {
 entry:
@@ -69,23 +71,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i64(i64 %n) {
 entry:
diff --git a/llvm/test/Verifier/AMDGPU/vgpr-memory.ll b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
new file mode 100644
index 0000000000000..406f77ca17599
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
@@ -0,0 +1,49 @@
+; RUN: not llvm-as %s --disable-output 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; A "VGPR as memory" global is register-backed: it has no defined initial
+; contents and per-lane storage, so it cannot be statically initialized or
+; atomically accessed. An addrspacecast to/from addrspace(13) is allowed but
+; lowers to poison (it has no meaningful numeric address), and likewise
+; ptrtoint/inttoptr are allowed, so neither is diagnosed here.
+
+; CHECK: atomic operations on the VGPR address space (13) are not allowed
+; CHECK-NEXT: atomicrmw add ptr addrspace(13) @valid.poison
+; CHECK: atomic operations on the VGPR address space (13) are not allowed
+; CHECK-NEXT: %v = load atomic i32, ptr addrspace(13) @valid.poison
+; CHECK: intrinsic with a VGPR address space (13) pointer argument is not allowed
+; CHECK-NEXT: call void @llvm.memcpy
+; CHECK: intrinsic with a VGPR address space (13) pointer argument is not allowed
+; CHECK-NEXT: call void @llvm.memset
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.init
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.zeroinit
+
+; A poison initializer (or none) is fine.
+@valid.poison = internal addrspace(13) global i32 poison
+@valid.array = internal addrspace(13) global [4 x i32] poison
+
+@bad.init = internal addrspace(13) global i32 7
+@bad.zeroinit = internal addrspace(13) global [2 x i32] zeroinitializer
+
+define void @atomic_rmw() {
+  atomicrmw add ptr addrspace(13) @valid.poison, i32 1 seq_cst
+  ret void
+}
+
+define i32 @atomic_load() {
+  %v = load atomic i32, ptr addrspace(13) @valid.poison seq_cst, align 4
+  ret i32 %v
+}
+
+define void @memcpy_vgpr(ptr %src) {
+  call void @llvm.memcpy.p13.p0.i64(ptr addrspace(13) @valid.poison, ptr %src, i64 16, i1 false)
+  ret void
+}
+
+define void @memset_vgpr() {
+  call void @llvm.memset.p13.i64(ptr addrspace(13) @valid.poison, i8 0, i64 16, i1 false)
+  ret void
+}
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index a082adbf6565e..0ec3c753c10f1 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -43,14 +43,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
   // and that ANDGCN adds p7 and p8 as well.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
             "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
             "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   // Check that the old AMDGCN p8:128:128 definition is upgraded
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
             "m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   // but that r600 does not.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
             "m:e-e-p:32:32-G1");
@@ -66,7 +66,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
       "m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
       "64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
       "1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
-      "128:48-p9:192:256:256:32");
+      "128:48-p9:192:256:256:32-p13:32:32");
 
   // Check that SystemZ adds -S64 if needed.
   EXPECT_EQ(UpgradeDataLayoutString(
@@ -158,24 +158,24 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
   EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
             "m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
             "m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
             "m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
 
   // Check that AMDGCN targets don't add already declared address space 7.
-  EXPECT_EQ(
-      UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
-      "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
-      "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
-      "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+  EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
+            "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
+  EXPECT_EQ(UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
+            "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
+  EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
+            "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
 
   // Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
   // flag.
@@ -216,9 +216,9 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
 
   // Check that AMDGPU targets add G1 if it's not present.
   EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("", "amdgcn"),
-      "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+  EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"),
+            "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
 
   // Check that SPIR & SPIRV targets add G1 if it's not present.
   EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");