[clang] [llvm] [AMDGPU] Add initial support for VGPR as memory (PR #205435)

Fri Jun 26 09:01:14 PDT 2026

https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/205435

>From 587f5182a78051d248866b3d8995a5c43ab54878 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 19 Jun 2026 14:05:08 -0500
Subject: [PATCH 1/3] Add initial support for VGPR as memory

---
 clang/include/clang/Basic/Attr.td             |   8 +
 clang/include/clang/Basic/AttrDocs.td         |  20 ++
 .../clang/Basic/DiagnosticCommonKinds.td      |   5 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   3 +
 clang/include/clang/Sema/SemaAMDGPU.h         |   1 +
 clang/lib/CodeGen/CGDecl.cpp                  |  41 ++-
 clang/lib/Sema/SemaAMDGPU.cpp                 |  14 ++
 clang/lib/Sema/SemaDeclAttr.cpp               |   3 +
 .../CodeGenHIP/amdgpu-vgpr-O0-warning.hip     |  14 ++
 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip    |  19 ++
 ...a-attribute-supported-attributes-list.test |   1 +
 clang/test/SemaCUDA/amdgpu-vgpr.cu            |  28 +++
 llvm/include/llvm/Support/AMDGPUAddrSpace.h   |   4 +
 llvm/lib/IR/VerifierAMDGPU.cpp                |   6 +-
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  16 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +++++++++++--
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   1 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp       | 145 +++++++++++
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 233 ++++++++++++++++--
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  28 ++-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  19 ++
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  11 +
 .../AMDGPU/amdgpu-vgpr-allocate-basic.ll      | 109 ++++++++
 .../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll     |  20 ++
 .../CodeGen/AMDGPU/as-vgpr-alloca-static.ll   |  58 +++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   1 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   9 +-
 llvm/test/Verifier/AMDGPU/alloca.ll           |  55 +++--
 31 files changed, 985 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
 create mode 100644 clang/test/SemaCUDA/amdgpu-vgpr.cu
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index f1ae66bd7f2bb..51a91a432957d 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2521,6 +2521,14 @@ def AMDGPUMaxNumWorkGroups : InheritableAttr {
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
+def AMDGPUVGPR : InheritableAttr {
+  let Spellings = [Clang<"amdgpu_vgpr">];
+  let Documentation = [AMDGPUVGPRDocs];
+  let Subjects = SubjectList<[LocalVar], ErrorDiag>;
+  // Only meaningful in CUDA/HIP; semantic checks restrict it to kernel locals.
+  let LangOpts = [CUDA];
+}
+
 def BPFPreserveAccessIndex : InheritableAttr,
                              TargetSpecificAttr<TargetBPF>  {
   let Spellings = [Clang<"preserve_access_index">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 7c1c88241aaa8..b80265a1aec1d 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3604,6 +3604,26 @@ An error will be given if:
   }];
 }
 
+def AMDGPUVGPRDocs : Documentation {
+  let Category = DocCatAMDGPUAttributes;
+  let Content = [{
+This attribute requests that a kernel-local variable be allocated in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
+so that accesses with statically known indices lower to vector register
+copies instead of scratch memory traffic.
+
+Clang supports the ``__attribute__((amdgpu_vgpr))`` or
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
+local variables declared in a ``__global__`` (kernel) function; applying it to
+a variable in a ``__device__`` or host function is an error. Outside HIP/CUDA
+the attribute is ignored with a warning.
+
+Known limitation: the request is only honored with optimizations enabled. At
+``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
+emitted.
+  }];
+}
+
 def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
   let Content = [{
 Clang supports several different calling conventions, depending on the target
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..fe03be43c80c7 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,6 +319,11 @@ def warn_stack_protection_ignore_attribute : Warning<
   "'stack_protector_ignore' attribute ignored due to "
   "'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
 
+def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
+  "%0 is not guaranteed to keep the variable in vector registers at -O0; "
+  "it may fall back to scratch memory">,
+  InGroup<DiagGroup<"amdgpu-vgpr">>;
+
 def warn_slh_does_not_support_asm_goto : Warning<
   "speculative load hardening does not protect functions with asm goto">,
   InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index cde99dfb16ec5..9d52492b8ce64 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3710,6 +3710,9 @@ def err_attribute_argument_invalid : Error<
 def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
   "'amdgpu_flat_work_group_size' attribute must match "
   "'reqd_work_group_size' product">;
+def err_amdgpu_vgpr_not_kernel_local : Error<
+  "%0 attribute can only be applied to local variables in "
+  "'__global__' (kernel) functions">;
 def err_attribute_argument_is_zero : Error<
   "%0 attribute must be greater than 0">;
 def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index a6205534e0de3..9cb74ed74f4b9 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -79,6 +79,7 @@ class SemaAMDGPU : public SemaBase {
   void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
   void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+  void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
 
   /// Expand a valid use of the feature identification builtins into its
   /// corresponding sequence of instructions.
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 7608f8cb6fc7a..bca2d11d47c6a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -41,6 +41,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 #include <optional>
 
 using namespace clang;
@@ -1601,9 +1602,37 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // Create the alloca.  Note that we set the name separately from
       // building the instruction so that it's there even in no-asserts
       // builds.
-      address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
-                                 allocaAlignment, D.getName(),
-                                 /*ArraySize=*/nullptr, &AllocaAddr);
+      //
+      // "VGPR as memory" objects keep their backing registers only once the
+      // optimizing register allocator runs. At -O0 the backend cannot lower
+      // these accesses (e.g. when the address escapes a basic block), so the
+      // request is not honored: fall back to an ordinary (scratch) alloca and
+      // warn, matching the documented behavior.
+      // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
+      // backing tuple to scratch) so this fallback can be removed.
+      const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+      const bool UseVGPRMemory =
+          VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
+      if (VGPRAttr && !UseVGPRMemory)
+        CGM.getDiags().Report(D.getLocation(),
+                              diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
+            << VGPRAttr;
+
+      if (UseVGPRMemory) {
+        // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
+        // address space so that statically indexed accesses lower to vector
+        // register copies instead of scratch memory.
+        auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
+                                        /*ArraySize=*/nullptr, D.getName(),
+                                        AllocaInsertPt->getIterator());
+        AI->setAlignment(allocaAlignment.getAsAlign());
+        AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+        address = AllocaAddr;
+      } else {
+        address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
+                                   allocaAlignment, D.getName(),
+                                   /*ArraySize=*/nullptr, &AllocaAddr);
+      }
 
       // Don't emit lifetime markers for MSVC catch parameters. The lifetime of
       // the catch parameter starts in the catchpad instruction, and we can't
@@ -1612,8 +1641,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
           D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
 
       // Emit a lifetime intrinsic if meaningful. There's no point in doing this
-      // if we don't have a valid insertion point (?).
-      if (HaveInsertPoint() && !IsMSCatchParam) {
+      // if we don't have a valid insertion point (?). "VGPR as memory" allocas
+      // live in a non-alloca address space, so the standard lifetime markers
+      // (which assume the alloca address space) are skipped for them.
+      if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
         // If there's a jump into the lifetime of this variable, its lifetime
         // gets broken up into several regions in IR, which requires more work
         // to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 29442617b6a13..b741b9e7f1e24 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DynamicRecursiveASTVisitor.h"
 #include "clang/AST/Expr.h"
@@ -626,6 +627,19 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
   addAMDGPUFlatWorkGroupSizeAttr(D, AL, MinExpr, MaxExpr);
 }
 
+void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
+  // The LocalVar subject list already guarantees this is a local variable.
+  // Restrict it further to locals declared directly in a __global__ kernel;
+  // it is meaningless (and an error) in __device__ or host functions.
+  const auto *FD = dyn_cast<FunctionDecl>(D->getDeclContext());
+  if (!FD || !FD->hasAttr<CUDAGlobalAttr>()) {
+    Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+    return;
+  }
+
+  D->addAttr(::new (getASTContext()) AMDGPUVGPRAttr(getASTContext(), AL));
+}
+
 static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
                                            Expr *MaxExpr,
                                            const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2159c586e5738..095a11acdd02d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7641,6 +7641,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_AMDGPUNumVGPR:
     S.AMDGPU().handleAMDGPUNumVGPRAttr(D, AL);
     break;
+  case ParsedAttr::AT_AMDGPUVGPR:
+    S.AMDGPU().handleAMDGPUVGPRAttr(D, AL);
+    break;
   case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
     S.AMDGPU().handleAMDGPUMaxNumWorkGroupsAttr(D, AL);
     break;
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
new file mode 100644
index 0000000000000..4d23008b8ef43
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// At -O0 "VGPR as memory" is not honored: the variable falls back to an
+// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
+
+#define __global__ __attribute__((global))
+
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
new file mode 100644
index 0000000000000..9a5c38e48951c
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -emit-llvm -O1 -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
+// memory" address space (addrspace(13)), and its accesses stay in that space.
+
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
+// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
+// CHECK: store i32 %{{.*}}, ptr addrspace(13)
+// CHECK: load i32, ptr addrspace(13)
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr));
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 03b9a77ec1814..69cc257aa3120 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -7,6 +7,7 @@
 // CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
 // CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUVGPR (SubjectMatchRule_variable_is_local)
 // CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
 // CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
 // CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
new file mode 100644
index 0000000000000..6ad3074921b9b
--- /dev/null
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+__global__ void kernel() {
+  int ok[4] __attribute__((amdgpu_vgpr)); // OK
+  (void)ok;
+}
+
+__device__ void device_fn() {
+  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+  (void)bad;
+}
+
+__host__ void host_fn() {
+  int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+  (void)bad;
+}
+
+// Not a local variable.
+int global_var __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+
+__global__ void takes_no_args() {
+  // Attribute does not accept arguments.
+  int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
+  (void)bad;
+}
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 01b1510524d0f..e9d3add54d054 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -47,6 +47,10 @@ enum : unsigned {
   BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
                               ///< pointers with an additional index.
 
+  VGPR = 13, ///< Address space for "VGPR as memory": objects backed by VGPRs
+             ///< rather than scratch. Shares its numeric value with the
+             ///< graphics-only CONSTANT_BUFFER_5 alias below.
+
   RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
 
   /// Internal address spaces. Can be freely renumbered.
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index 04cb214ef2520..de9a0c7bef132 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -122,8 +122,10 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
   if (!VS.TT.isAMDGPU())
     return;
 
-  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
-    VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+      AI.getAddressSpace() != AMDGPUAS::VGPR)
+    VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
+                   &AI);
 }
 
 bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c6dd1dbb62449..3336ea6d1f943 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernelArgumentsLegacyID;
 
 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca();
+FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
@@ -276,6 +276,20 @@ struct AMDGPUPromoteAllocaPass
   TargetMachine &TM;
 };
 
+void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+extern char &AMDGPUPrivateObjectVGPRsID;
+
+// Allocates pre-existing VGPR address space allocas without performing any
+// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
+// objects remain functional.
+struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
+  AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
 struct AMDGPUPromoteAllocaToVectorPass
     : OptionalPassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
   AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7330f3b13f3cb..8e289058a2ed1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
 #include "R600RegisterInfo.h"
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -341,25 +343,159 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  if (!Subtarget->d16PreservesUnusedBits())
-    return;
+// Resolve the constant byte offset within the per-function VGPR file for a
+// "VGPR as memory" access whose (legalized) address is \p Ptr. Returns
+// std::nullopt if \p Ptr is not a constant offset from a VGPR-as-memory frame
+// object.
+static std::optional<unsigned>
+getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
+  unsigned ExtraOffset = 0;
+  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
+      ExtraOffset = C->getZExtValue();
+      Ptr = Ptr.getOperand(0);
+    }
+  }
+  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+  if (!FI)
+    return std::nullopt;
+  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+    return std::nullopt;
+  return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+}
+
+// Lower a load/store of a "VGPR as memory" object into one
+// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
+// offset. The pseudos are later expanded into subregister copies by
+// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
+// split into their dword lanes; sub-dword and non-dword-multiple accesses are
+// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
+// true if \p N was rewritten.
+bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
+  if (auto *Load = dyn_cast<LoadSDNode>(N)) {
+    if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
+        Load->getExtensionType() != ISD::NON_EXTLOAD)
+      return false;
+    EVT VT = Load->getValueType(0);
+    unsigned Bits = VT.getFixedSizeInBits();
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    std::optional<unsigned> Offset =
+        getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
+    if (!Offset || (*Offset % 4 != 0))
+      return false;
+
+    SDLoc DL(N);
+    unsigned NumDwords = Bits / 32;
+    SmallVector<SDValue, 4> Dwords;
+    SmallVector<SDValue, 4> Chains;
+    for (unsigned I = 0; I != NumDwords; ++I) {
+      SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+                       Load->getChain()};
+      MachineSDNode *Lane = CurDAG->getMachineNode(
+          AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
+      if (I == 0)
+        CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
+      Dwords.push_back(SDValue(Lane, 0));
+      Chains.push_back(SDValue(Lane, 1));
+    }
+
+    SDValue Val;
+    if (NumDwords == 1) {
+      Val = Dwords[0];
+      if (VT != MVT::i32)
+        Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
+    } else {
+      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+      SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
+      Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
+    }
+    SDValue Chain = NumDwords == 1 ? Chains[0]
+                                   : CurDAG->getNode(ISD::TokenFactor, DL,
+                                                     MVT::Other, Chains);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
+    return true;
+  }
+
+  if (auto *Store = dyn_cast<StoreSDNode>(N)) {
+    if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
+        Store->isTruncatingStore())
+      return false;
+    SDValue Val = Store->getValue();
+    EVT VT = Val.getValueType();
+    unsigned Bits = VT.getFixedSizeInBits();
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    std::optional<unsigned> Offset =
+        getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
+    if (!Offset || (*Offset % 4 != 0))
+      return false;
+
+    SDLoc DL(N);
+    unsigned NumDwords = Bits / 32;
+    SmallVector<SDValue, 4> Dwords;
+    if (NumDwords == 1) {
+      if (VT != MVT::i32)
+        Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
+      Dwords.push_back(Val);
+    } else {
+      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+      SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
+      for (unsigned I = 0; I != NumDwords; ++I)
+        Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                         Vec,
+                                         CurDAG->getConstant(I, DL, MVT::i32)));
+    }
+
+    SmallVector<SDValue, 4> Chains;
+    for (unsigned I = 0; I != NumDwords; ++I) {
+      SDValue Ops[] = {Dwords[I],
+                       CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+                       Store->getChain()};
+      MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
+                                                   DL, MVT::Other, Ops);
+      if (I == 0)
+        CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
+      Chains.push_back(SDValue(Lane, 0));
+    }
+    SDValue Chain = NumDwords == 1 ? Chains[0]
+                                   : CurDAG->getNode(ISD::TokenFactor, DL,
+                                                     MVT::Other, Chains);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
+    return true;
+  }
 
-  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+  return false;
+}
 
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
   bool MadeChange = false;
-  while (Position != CurDAG->allnodes_begin()) {
-    SDNode *N = &*--Position;
-    if (N->use_empty())
-      continue;
-
-    switch (N->getOpcode()) {
-    case ISD::BUILD_VECTOR:
-      // TODO: Match load d16 from shl (extload:i16), 16
-      MadeChange |= matchLoadD16FromBuildVector(N);
-      break;
-    default:
-      break;
+
+  // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
+  // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
+  SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
+  while (VGPRPos != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--VGPRPos;
+    MadeChange |= rewriteVGPRFrameAccess(N);
+  }
+
+  if (Subtarget->d16PreservesUnusedBits()) {
+    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+    while (Position != CurDAG->allnodes_begin()) {
+      SDNode *N = &*--Position;
+      if (N->use_empty())
+        continue;
+
+      switch (N->getOpcode()) {
+      case ISD::BUILD_VECTOR:
+        // TODO: Match load d16 from shl (extload:i16), 16
+        MadeChange |= matchLoadD16FromBuildVector(N);
+        break;
+      default:
+        break;
+      }
     }
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 95f85a6151375..cf62874912742 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,6 +67,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   bool matchLoadD16FromBuildVector(SDNode *N) const;
+  bool rewriteVGPRFrameAccess(SDNode *N);
   void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 2a6560b309e62..b377704c2f296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
new file mode 100644
index 0000000000000..a3a1cf6f18bed
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -0,0 +1,145 @@
+//===-- AMDGPUPrivateObjectVGPRs.cpp - Lower VGPR-as-memory accesses ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
+/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
+/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
+/// carries a constant byte offset, which selects the dword (subregister) to
+/// copy.
+///
+/// This runs once the function is out of SSA form (so the single backing tuple
+/// can be defined by several subregister copies) and while LiveIntervals is
+/// available. The backing tuple has lane-divergent liveness (its subregisters
+/// are written and read independently), which the whole-register LiveVariables
+/// analysis cannot represent; the pass therefore updates the subregister-aware
+/// LiveIntervals directly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-private-object-vgprs"
+
+namespace {
+
+class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "AMDGPU Private Object VGPRs";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<LiveIntervalsWrapperPass>();
+    AU.addPreserved<LiveIntervalsWrapperPass>();
+    AU.addPreserved<SlotIndexesWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+                "AMDGPU Private Object VGPRs", false, false)
+
+char AMDGPUPrivateObjectVGPRs::ID = 0;
+
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+
+bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Collect the pseudos and determine how many dwords the backing tuple needs.
+  SmallVector<MachineInstr *, 8> Worklist;
+  unsigned NumDwords = 0;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opc = MI.getOpcode();
+      if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
+          Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+        continue;
+      unsigned ByteOffset = MI.getOperand(1).getImm();
+      NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+      Worklist.push_back(&MI);
+    }
+  }
+
+  if (Worklist.empty())
+    return false;
+
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
+  assert(RC && "no VGPR register class for VGPR-as-memory object");
+  Register Storage = MRI.createVirtualRegister(RC);
+
+  // Define the whole tuple up front so partial (subregister) writes and reads
+  // of uninitialized lanes are well formed.
+  MachineBasicBlock &Entry = MF.front();
+  MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
+                                 TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
+  LIS->InsertMachineInstrInMaps(*ImpDef);
+
+  for (MachineInstr *MI : Worklist) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    const DebugLoc &DL = MI->getDebugLoc();
+    unsigned Dword = MI->getOperand(1).getImm() / 4;
+    unsigned SubReg = NumDwords == 1
+                          ? AMDGPU::NoSubRegister
+                          : SIRegisterInfo::getSubRegFromChannel(Dword);
+
+    MachineInstr *Copy;
+    if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+      Register Dst = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
+                 .addReg(Storage, {}, SubReg);
+    } else {
+      Register Src = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
+                 .addReg(Storage, RegState::Define, SubReg)
+                 .addReg(Src);
+    }
+    // The copy takes the pseudo's slot, so the intervals of the copied
+    // load/store operand stay valid.
+    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
+    MI->eraseFromParent();
+  }
+
+  // The backing tuple is brand new; compute its (subregister) live interval.
+  LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
+
+  // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
+  // form disconnected value-number components within the single tuple, which an
+  // individual live interval must not contain. Split them into separate
+  // virtual registers, exactly as the register coalescer does for the intervals
+  // it leaves behind.
+  SmallVector<LiveInterval *, 4> SplitLIs;
+  LIS->splitSeparateComponents(LI, SplitLIs);
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 95e06dc8295d9..32ab847c8d8f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,6 +35,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -138,6 +139,7 @@ class AMDGPUPromoteAllocaImpl {
   unsigned MaxVGPRs;
   unsigned VGPRBudgetRatio;
   unsigned MaxVectorRegs;
+  unsigned AllocVGPROffset = 0;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -162,6 +164,10 @@ class AMDGPUPromoteAllocaImpl {
   void analyzePromoteToVector(AllocaAnalysis &AA) const;
   void promoteAllocaToVector(AllocaAnalysis &AA);
   void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+
+  /// Allocate an alloca that already lives in the VGPR address space to a range
+  /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
+  void allocateVgprs(AllocaAnalysis &AA);
   bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                              SetVector<IntrinsicInst *> &DeferredIntrs);
   void
@@ -179,7 +185,11 @@ class AMDGPUPromoteAllocaImpl {
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
   }
 
-  bool run(Function &F, bool PromoteToLDS);
+  /// IsLatePass is true when invoked as a codegen pass and false when invoked
+  /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
+  /// requests only the work strictly required for functionality (i.e. VGPR
+  /// allocation), skipping the optimization-oriented promotions.
+  bool run(Function &F, bool IsLatePass, bool NoOpt);
 };
 
 // FIXME: This can create globals so should be a module pass.
@@ -187,26 +197,34 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 public:
   static char ID;
 
-  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+  explicit AMDGPUPromoteAlloca(
+      CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+      : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
 
   bool runOnFunction(Function &F) override {
-    if (skipFunction(F))
-      return false;
-    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+    // VGPR allocation is required: optnone/opt-bisect only disable promotion.
+    const bool OnlyRequired = NoOpt || skipFunction(F);
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
       return AMDGPUPromoteAllocaImpl(
                  TPC->getTM<TargetMachine>(),
                  getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
-          .run(F, /*PromoteToLDS*/ true);
+          .run(F, /*IsLatePass=*/true, OnlyRequired);
+    }
     return false;
   }
 
-  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+  StringRef getPassName() const override {
+    return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
+  }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
+
+private:
+  bool NoOpt;
 };
 
 static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -253,7 +271,8 @@ char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
+                                                     /*NoOpt=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -265,7 +284,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
 PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/false,
+                                                     /*NoOpt=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -274,8 +294,21 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   return PreservedAnalyses::all();
 }
 
-FunctionPass *llvm::createAMDGPUPromoteAlloca() {
-  return new AMDGPUPromoteAlloca();
+PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  // Late-pass NoOpt mode: only the work strictly required for functionality
+  // (VGPR allocation) is requested; CFG analyses are reported as preserved.
+  LoopInfo &Loops = AM.getResult<LoopAnalysis>(F);
+  AMDGPUPromoteAllocaImpl Impl(TM, Loops);
+  if (!Impl.run(F, /*IsLatePass=*/true, /*NoOpt=*/true))
+    return PreservedAnalyses::all();
+  PreservedAnalyses Preserved;
+  Preserved.preserveSet<CFGAnalyses>();
+  return Preserved;
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
+  return new AMDGPUPromoteAlloca(OptLevel);
 }
 
 bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -368,9 +401,110 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
     VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
 }
 
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
-  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
+// A "VGPR as memory" object can only be realized in registers today when every
+// access is an in-bounds, constant-offset, dword-aligned, whole-dword-multiple
+// (32, 64, ... bit) load/store and its address never escapes. Sub-dword
+// accesses, dynamic indexing and escaping addresses need gfx13 support, which
+// is not yet available; such objects fall back to scratch instead.
+// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
+// support so this fallback is no longer needed.
+static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
+                                            const DataLayout &DL) {
+  // Only the allocation itself is backed by registers, so a constant offset
+  // outside it (possible in never-executed code) must not be lowered either.
+  const std::optional<TypeSize> Size = AI.getAllocationSize(DL);
+  const uint64_t AllocBytes = Size ? Size->getFixedValue() : 0;
+  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
+    if (!Simple)
+      return false;
+    uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
+        DL, Off, /*AllowNonInbounds=*/true);
+    return Base == &AI && Off.urem(4) == 0 && Off.ule(AllocBytes) &&
+           Bits / 8 <= AllocBytes - Off.getZExtValue();
+  };
+
+  SmallVector<const Use *, 16> Worklist(make_pointer_range(AI.uses()));
+
+  while (!Worklist.empty()) {
+    const Use *U = Worklist.pop_back_val();
+    const User *Usr = U->getUser();
+
+    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      for (const Use &GU : GEP->uses())
+        Worklist.push_back(&GU);
+      continue;
+    }
+    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
+      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
+        return false;
+      continue;
+    }
+    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
+      // The pointer must be the address operand, not a stored value (escape).
+      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return false;
+      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
+                    SI->isSimple()))
+        return false;
+      continue;
+    }
+    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
+    // otherwise not statically lowerable.
     return false;
+  }
+  return true;
+}
+
+// Repoint every use of \p Old (an addrspace(13) value), following GEPs, at
+// \p New (an addrspace(5) value) so the object falls back to ordinary scratch.
+static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
+  SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
+  for (Use *U : Uses) {
+    User *Usr = U->getUser();
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      IRBuilder<> B(GEP);
+      SmallVector<Value *, 4> Indices(GEP->indices());
+      Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
+                                  GEP->getName(), GEP->getNoWrapFlags());
+      rewriteVGPRPointerToScratch(GEP, NewGEP);
+      GEP->eraseFromParent();
+      continue;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(Usr);
+        II && II->isLifetimeStartOrEnd()) {
+      II->eraseFromParent();
+      continue;
+    }
+    // Every other user just gets this operand repointed. NOTE(review): users
+    // whose own type embeds addrspace(13) (phi, select, overloaded intrinsics)
+    // are not retyped here and would need extra handling -- confirm/cover.
+    U->set(New);
+  }
+}
+
+// Replace \p AI with an addrspace(5) alloca and retarget all of its uses.
+static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
+  auto *NewAI = new AllocaInst(
+      AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
+      AI->getAlign(), AI->getName(), AI->getIterator());
+  NewAI->setDebugLoc(AI->getDebugLoc());
+  rewriteVGPRPointerToScratch(AI, NewAI);
+  AI->eraseFromParent();
+}
+
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
+  assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
+  if (!IsLatePass && DisablePromoteAllocaToVector)
+    return false;
+
+  bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
+  bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
 
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -379,6 +513,12 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
   setFunctionLimits(F);
 
+  // "VGPR as memory" is only enabled on gfx940/gfx950 (CDNA3+) and gfx12xx /
+  // gfx13xx; other targets (incl. R600, which has no GCNSubtarget) use scratch.
+  const auto *ST = IsAMDGCN ? &TM.getSubtarget<GCNSubtarget>(F) : nullptr;
+  const bool TargetSupportsVGPRAsMemory =
+      ST && (ST->hasGFX940Insts() ||
+             ST->getGeneration() >= AMDGPUSubtarget::GFX12);
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
@@ -395,8 +535,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
       LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
 
       AllocaAnalysis AA{AI};
+      if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
+        // Allocas that already live in the VGPR address space only need to be
+        // assigned VGPRs, which is required for functionality.
+        if (IsLatePass)
+          Allocas.push_back(std::move(AA));
+        continue;
+      }
+      if (!PromoteToVector && !PromoteToLDS)
+        continue;
       if (collectAllocaUses(AA)) {
-        analyzePromoteToVector(AA);
+        if (PromoteToVector)
+          analyzePromoteToVector(AA);
         if (PromoteToLDS)
           analyzePromoteToLDS(AA);
         if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -407,8 +557,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     }
   }
 
-  stable_sort(Allocas,
-              [](const auto &A, const auto &B) { return A.Score > B.Score; });
+  stable_sort(Allocas, [](const auto &A, const auto &B) {
+    // Prioritize pre-existing VGPR allocas, since their allocation must not
+    // fail.
+    bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+    bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+    if (AIsVGPR != BIsVGPR)
+      return AIsVGPR;
+    return A.Score > B.Score;
+  });
 
   // clang-format off
   LLVM_DEBUG(
@@ -421,6 +578,39 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   bool Changed = false;
   SetVector<IntrinsicInst *> DeferredIntrs;
   for (AllocaAnalysis &AA : Allocas) {
+    if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
+      // Fall back to scratch (and warn) when the object can't be kept in
+      // registers, so the program still compiles correctly: either the target
+      // does not support "VGPR as memory", or the access pattern (dynamic
+      // index, sub-dword, escaping address) is not yet supported.
+      const char *Unsupported = nullptr;
+      if (!TargetSupportsVGPRAsMemory)
+        Unsupported = "not supported on this target";
+      else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
+        Unsupported = "dynamic indexing, sub-dword access, or escaping address "
+                      "is not yet supported";
+      if (Unsupported) {
+        F.getContext().diagnose(DiagnosticInfoUnsupported(
+            F,
+            Twine("'amdgpu_vgpr' object could not be kept in vector registers "
+                  "(") +
+                Unsupported + "); using scratch memory instead",
+            AA.Alloca->getDebugLoc(), DS_Warning));
+        demoteVGPRAllocaToScratch(AA.Alloca);
+        Changed = true;
+        continue;
+      }
+      const unsigned AllocaCost =
+          AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
+      allocateVgprs(AA);
+      // Account for the consumed VGPRs in the vectorization budget.
+      if (VectorizationBudget > AllocaCost)
+        VectorizationBudget -= AllocaCost;
+      else
+        VectorizationBudget = 0;
+      Changed = true;
+      continue;
+    }
     if (AA.Vector.Ty) {
       std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
       assert(Size); // Expected to succeed on non-array alloca.
@@ -455,6 +645,21 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   return Changed;
 }
 
+void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
+  // Reserve the whole allocation (padding included), exactly what run() charges
+  // to the budget; getTypeSizeInBits() / 8 under-reserves padded types.
+  const unsigned AllocaSize =
+      AA.Alloca->getAllocationSize(*DL)->getFixedValue();
+  // Record where the object was allocated within the VGPR file.
+  Type *I32 = Type::getInt32Ty(Mod->getContext());
+  Metadata *Ops[] = {
+      ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
+      ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))};
+  AA.Alloca->setMetadata("amdgpu.allocated.vgprs",
+                         MDNode::get(Mod->getContext(), Ops));
+  AllocVGPROffset += alignTo(AllocaSize, 4);
+}
+
 // Checks if the instruction I is a memset user of the alloca AI that we can
 // deal with. Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ae6e6d0bdcd1e..5814862a514b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -668,6 +668,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerSGPRSpillsLegacyPass(*PR);
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
+  initializeAMDGPUPrivateObjectVGPRsPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1500,9 +1501,12 @@ void AMDGPUPassConfig::addIRPasses() {
 
   addPass(createAtomicExpandLegacyPass());
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
-    addPass(createAMDGPUPromoteAlloca());
+  // With optimizations enabled, do the full promotion of allocas. Without
+  // optimizations, this only allocates pre-existing VGPR address space allocas,
+  // which is required for functionality.
+  addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
 
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -1717,6 +1721,11 @@ void GCNPassConfig::addFastRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
+  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+  // At O0 there is no register coalescer; anchor on TwoAddress, where
+  // LiveIntervals is already available.
+  insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
+
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
 
   TargetPassConfig::addFastRegAlloc();
@@ -1743,6 +1752,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
+  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+  // This runs after the coalescer so it does not perturb the kill flags that
+  // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
+  // the LiveIntervals the register allocator consumes next.
+  insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
+
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
@@ -2283,8 +2298,15 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
 
   addFunctionPass(AtomicExpandPass(TM), PMW);
 
-  if (TM.getOptLevel() > CodeGenOptLevel::None) {
+  // With optimizations enabled, do the full promotion of allocas. Without
+  // optimizations, only allocate pre-existing VGPR address space allocas, which
+  // is required for functionality.
+  if (TM.getOptLevel() > CodeGenOptLevel::None)
     addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
+  else
+    addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
+
+  if (TM.getOptLevel() > CodeGenOptLevel::None) {
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(PMW);
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 46edc44e2cc05..dd25ab71997d7 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -100,6 +100,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUPreloadKernArgProlog.cpp
   AMDGPUPreloadKernelArguments.cpp
   AMDGPUPrintfRuntimeBinding.cpp
+  AMDGPUPrivateObjectVGPRs.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPUPromoteKernelArguments.cpp
   AMDGPURegBankCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 750cb1973e21f..3594caef86782 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,6 +1243,25 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
 }
 } // End Spill = 1, VALU = 1, isConvergent = 1
 
+// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
+// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
+// within the per-function VGPR file. They are produced during instruction
+// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
+// pass before register allocation.
+let hasSideEffects = 0 in {
+def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
+                                        (ins i32imm:$offset)> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
+                                         (ins VGPR_32:$vdata, i32imm:$offset)> {
+  let mayLoad = 0;
+  let mayStore = 1;
+}
+} // End hasSideEffects = 0
+
 // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
 // needs to be used and an extra instruction to move between VGPR and AGPR.
 // UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96571dd028b14..7528cd2a009a3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1779,6 +1780,17 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
   return false;
 }
 
+AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
+  const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
+  assert(MD && MD->getNumOperands() == 2 &&
+         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
+  // Operand 0 becomes Address and operand 1 becomes Size.
+  auto ReadOperand = [MD](unsigned Idx) -> unsigned {
+    return mdconst::extract<ConstantInt>(MD->getOperand(Idx))->getZExtValue();
+  };
+  return {ReadOperand(0), ReadOperand(1)};
+}
+
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1623dc72d2810..b34dde7cb2cd7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -30,6 +30,7 @@ struct amd_kernel_code_t;
 namespace llvm {
 
 struct Align;
+class AllocaInst;
 class Argument;
 class Function;
 class GlobalValue;
@@ -1032,6 +1033,16 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
 /// Checks if \p Val is inside \p MD, a !range-like metadata.
 bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
 
+/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
+/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
+/// to within the VGPR file, and its size in bytes.
+struct AllocatedVGPRsMetadata {
+  unsigned Address;
+  unsigned Size;
+
+  static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
+};
+
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
new file mode 100644
index 0000000000000..f6c64c5121867
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
+
+define void @vgpr_alloca() {
+; CHECK-LABEL: define void @vgpr_alloca(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  ret void
+}
+
+define void @vgpr_alloca_multiple() {
+; CHECK-LABEL: define void @vgpr_alloca_multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
+; CHECK-NEXT:    [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[B]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i32, align 4, addrspace(13)
+  %b = alloca [2 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  store i32 0, ptr addrspace(13) %b
+  ret void
+}
+
+define void @private_alloca_unchanged() {
+; CHECK-LABEL: define void @private_alloca_unchanged(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    store i64 42, ptr addrspace(5) [[A]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i64], align 4, addrspace(5)
+  store i64 42, ptr addrspace(5) %a
+  ret void
+}
+
+declare void @use(ptr)
+
+; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
+; back to ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
+; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
+; CHECK-NEXT:    store i32 [[V]], ptr addrspace(5) [[P2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+; A VGPR object whose address escapes (here via a cast to a generic pointer, as
+; the frontend emits) cannot be kept in registers yet, so it falls back to
+; ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_escaping() {
+; CHECK-LABEL: define void @vgpr_alloca_escaping(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
+; CHECK-NEXT:    call void @use(ptr [[CAST]])
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %cast = addrspacecast ptr addrspace(13) %a to ptr
+  call void @use(ptr %cast)
+  ret void
+}
+
+; Whole-dword-multiple accesses (here i64) stay in VGPRs.
+define void @vgpr_alloca_i64(i64 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_i64(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
+; CHECK-NEXT:    store i64 [[V]], ptr addrspace(13) [[A]], align 8
+; CHECK-NEXT:    ret void
+;
+  %a = alloca i64, align 8, addrspace(13)
+  store i64 %v, ptr addrspace(13) %a
+  ret void
+}
+
+; Sub-dword accesses are not supported yet, so the object falls back to scratch.
+define void @vgpr_alloca_subdword(i16 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_subdword(
+; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
+; CHECK-NEXT:    store i16 [[V]], ptr addrspace(5) [[A1]], align 2
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [2 x i16], align 4, addrspace(13)
+  store i16 %v, ptr addrspace(13) %a
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0, i32 16}
+; CHECK: [[META1]] = !{i32 0, i32 4}
+; CHECK: [[META2]] = !{i32 4, i32 8}
+; CHECK: [[META3]] = !{i32 0, i32 8}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
new file mode 100644
index 0000000000000..63ba44b479279
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -0,0 +1,20 @@
+; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
+; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
+; scratch.
+
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+
+define void @vgpr_obj() {
+; SUPP:   alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
+; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  store i32 0, ptr addrspace(13) %a
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
new file mode 100644
index 0000000000000..ea914907a900d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
@@ -0,0 +1,58 @@
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
+; indices must lower to register copies, never to scratch/buffer memory traffic.
+
+; CHECK-LABEL: store_load_i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_i32(i32 %v) {
+  %a = alloca i32, align 4, addrspace(13)
+  store i32 %v, ptr addrspace(13) %a
+  %l = load i32, ptr addrspace(13) %a
+  %r = add i32 %l, 1
+  ret i32 %r
+}
+
+; CHECK-LABEL: store_load_array:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_array(i32 %v) {
+  %a = alloca [4 x i32], align 4, addrspace(13)
+  %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
+  %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
+  store i32 %v, ptr addrspace(13) %p1
+  store i32 7, ptr addrspace(13) %p3
+  %l1 = load i32, ptr addrspace(13) %p1
+  %l3 = load i32, ptr addrspace(13) %p3
+  %s = add i32 %l1, %l3
+  ret i32 %s
+}
+
+; A 64-bit (two-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_i64:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i64 @store_load_i64(i64 %v) {
+  %a = alloca i64, align 8, addrspace(13)
+  store i64 %v, ptr addrspace(13) %a
+  %l = load i64, ptr addrspace(13) %a
+  %r = add i64 %l, 1
+  ret i64 %r
+}
+
+; A vector (four-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_v4i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
+  %a = alloca <4 x i32>, align 16, addrspace(13)
+  store <4 x i32> %v, ptr addrspace(13) %a
+  %l = load <4 x i32>, ptr addrspace(13) %a
+  ret <4 x i32> %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 73f807e9d55c5..94173fb7b11d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -29,6 +29,7 @@
 ; GCN-O0-NEXT: amdgpu-lower-module-lds
 ; GCN-O0-NEXT: function
 ; GCN-O0-NEXT:   atomic-expand
+; GCN-O0-NEXT:   amdgpu-vgpr-allocate
 ; GCN-O0-NEXT:   verify
 ; GCN-O0-NEXT:   unreachableblockelim
 ; GCN-O0-NEXT:   ee-instrument<post-inline>
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 070c873798647..aabfadd33e976 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,11 +49,13 @@
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand Atomic instructions
+; GCN-O0-NEXT:      Dominator Tree Construction
+; GCN-O0-NEXT:      Natural Loop Information
+; GCN-O0-NEXT:      AMDGPU VGPR Allocate
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O0-NEXT:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; GCN-O0-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT:      Expand reduction intrinsics
-; GCN-O0-NEXT:      Dominator Tree Construction
 ; GCN-O0-NEXT:      AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT:    Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT:    AMDGPU lower intrinsics
@@ -115,6 +117,7 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Slot index numbering
 ; GCN-O0-NEXT:        Live Interval Analysis
+; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Whole Quad Mode
 ; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
@@ -359,6 +362,7 @@
 ; GCN-O1-NEXT:        Live Interval Analysis
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Register Coalescer
+; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
@@ -676,6 +680,7 @@
 ; GCN-O1-OPTS-NEXT:        Live Interval Analysis
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Register Coalescer
+; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-OPTS-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
@@ -998,6 +1003,7 @@
 ; GCN-O2-NEXT:        Live Interval Analysis
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Register Coalescer
+; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O2-NEXT:        Rewrite Partial Register Uses
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
@@ -1334,6 +1340,7 @@
 ; GCN-O3-NEXT:        Live Interval Analysis
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Register Coalescer
+; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O3-NEXT:        Rewrite Partial Register Uses
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index f31d6228d7936..bd760de79c9d0 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,23 +2,23 @@
 
 target triple = "amdgcn-amd-amdhsa"
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
 define void @static_alloca() {
 entry:
@@ -32,26 +32,27 @@ entry:
   %alloca.7 = alloca i32, align 4, addrspace(7)
   %alloca.8 = alloca i32, align 4, addrspace(8)
   %alloca.9 = alloca i32, align 4, addrspace(9)
+  %alloca.13 = alloca i32, align 4, addrspace(13)
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i32(i32 %n) {
 entry:
@@ -68,23 +69,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i64(i64 %n) {
 entry:

>From 8e5a8ec027c51b92143f95404e3ad6f068300742 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Wed, 24 Jun 2026 19:32:40 -0500
Subject: [PATCH 2/3] Lower VGPR-as-memory accesses via REG_LOAD/REG_STORE
 nodes

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 405 ++++++------------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   3 +-
 .../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp       |  70 ++-
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp |  10 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 121 +++++-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       | 113 +++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  13 +
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  51 ++-
 .../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll     |   3 +-
 9 files changed, 405 insertions(+), 384 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e289058a2ed1..66c9353cd5c33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -68,7 +68,8 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
 
   SDValue Srl = In.getOperand(0);
   if (Srl.getOpcode() == ISD::SRL) {
-    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+    if (ConstantSDNode *ShiftAmt =
+            dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
       if (ShiftAmt->getZExtValue() == 16) {
         Out = stripBitcast(Srl.getOperand(0));
         return true;
@@ -284,22 +285,20 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
 
     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
-    SDValue Ops[] = {
-      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
-    };
+    SDValue Ops[] = {LdHi->getChain(), LdHi->getBasePtr(), TiedIn};
 
     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
     if (LdHi->getMemoryVT() == MVT::i8) {
-      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
-        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD
+                   ? AMDGPUISD::LOAD_D16_HI_I8
+                   : AMDGPUISD::LOAD_D16_HI_U8;
     } else {
       assert(LdHi->getMemoryVT() == MVT::i16);
     }
 
     SDValue NewLoadHi =
-      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
-                                  Ops, LdHi->getMemoryVT(),
-                                  LdHi->getMemOperand());
+        CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, Ops,
+                                    LdHi->getMemoryVT(), LdHi->getMemOperand());
 
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
@@ -318,22 +317,20 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
     if (LdLo->getMemoryVT() == MVT::i8) {
-      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
-        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD
+                   ? AMDGPUISD::LOAD_D16_LO_I8
+                   : AMDGPUISD::LOAD_D16_LO_U8;
     } else {
       assert(LdLo->getMemoryVT() == MVT::i16);
     }
 
     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
 
-    SDValue Ops[] = {
-      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
-    };
+    SDValue Ops[] = {LdLo->getChain(), LdLo->getBasePtr(), TiedIn};
 
     SDValue NewLoadLo =
-      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
-                                  Ops, LdLo->getMemoryVT(),
-                                  LdLo->getMemOperand());
+        CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, Ops,
+                                    LdLo->getMemoryVT(), LdLo->getMemOperand());
 
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
@@ -343,144 +340,9 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
   return false;
 }
 
-// Resolve the constant byte offset within the per-function VGPR file for a
-// "VGPR as memory" access whose (legalized) address is \p Ptr. Returns
-// std::nullopt if \p Ptr is not a constant offset from a VGPR-as-memory frame
-// object.
-static std::optional<unsigned>
-getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
-  unsigned ExtraOffset = 0;
-  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
-    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
-      ExtraOffset = C->getZExtValue();
-      Ptr = Ptr.getOperand(0);
-    }
-  }
-  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
-  if (!FI)
-    return std::nullopt;
-  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
-  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
-    return std::nullopt;
-  return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
-}
-
-// Lower a load/store of a "VGPR as memory" object into one
-// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
-// offset. The pseudos are later expanded into subregister copies by
-// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
-// split into their dword lanes; sub-dword and non-dword-multiple accesses are
-// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
-// true if \p N was rewritten.
-bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
-  if (auto *Load = dyn_cast<LoadSDNode>(N)) {
-    if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
-        Load->getExtensionType() != ISD::NON_EXTLOAD)
-      return false;
-    EVT VT = Load->getValueType(0);
-    unsigned Bits = VT.getFixedSizeInBits();
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    std::optional<unsigned> Offset =
-        getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
-    if (!Offset || (*Offset % 4 != 0))
-      return false;
-
-    SDLoc DL(N);
-    unsigned NumDwords = Bits / 32;
-    SmallVector<SDValue, 4> Dwords;
-    SmallVector<SDValue, 4> Chains;
-    for (unsigned I = 0; I != NumDwords; ++I) {
-      SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
-                       Load->getChain()};
-      MachineSDNode *Lane = CurDAG->getMachineNode(
-          AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
-      if (I == 0)
-        CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
-      Dwords.push_back(SDValue(Lane, 0));
-      Chains.push_back(SDValue(Lane, 1));
-    }
-
-    SDValue Val;
-    if (NumDwords == 1) {
-      Val = Dwords[0];
-      if (VT != MVT::i32)
-        Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
-    } else {
-      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
-      SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
-      Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
-    }
-    SDValue Chain = NumDwords == 1 ? Chains[0]
-                                   : CurDAG->getNode(ISD::TokenFactor, DL,
-                                                     MVT::Other, Chains);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
-    return true;
-  }
-
-  if (auto *Store = dyn_cast<StoreSDNode>(N)) {
-    if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
-        Store->isTruncatingStore())
-      return false;
-    SDValue Val = Store->getValue();
-    EVT VT = Val.getValueType();
-    unsigned Bits = VT.getFixedSizeInBits();
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    std::optional<unsigned> Offset =
-        getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
-    if (!Offset || (*Offset % 4 != 0))
-      return false;
-
-    SDLoc DL(N);
-    unsigned NumDwords = Bits / 32;
-    SmallVector<SDValue, 4> Dwords;
-    if (NumDwords == 1) {
-      if (VT != MVT::i32)
-        Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
-      Dwords.push_back(Val);
-    } else {
-      EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
-      SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
-      for (unsigned I = 0; I != NumDwords; ++I)
-        Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
-                                         Vec,
-                                         CurDAG->getConstant(I, DL, MVT::i32)));
-    }
-
-    SmallVector<SDValue, 4> Chains;
-    for (unsigned I = 0; I != NumDwords; ++I) {
-      SDValue Ops[] = {Dwords[I],
-                       CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
-                       Store->getChain()};
-      MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
-                                                   DL, MVT::Other, Ops);
-      if (I == 0)
-        CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
-      Chains.push_back(SDValue(Lane, 0));
-    }
-    SDValue Chain = NumDwords == 1 ? Chains[0]
-                                   : CurDAG->getNode(ISD::TokenFactor, DL,
-                                                     MVT::Other, Chains);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
-    return true;
-  }
-
-  return false;
-}
-
 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
   bool MadeChange = false;
 
-  // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
-  // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
-  SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
-  while (VGPRPos != CurDAG->allnodes_begin()) {
-    SDNode *N = &*--VGPRPos;
-    MadeChange |= rewriteVGPRFrameAccess(N);
-  }
-
   if (Subtarget->d16PreservesUnusedBits()) {
     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
     while (Position != CurDAG->allnodes_begin()) {
@@ -501,8 +363,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
 
   if (MadeChange) {
     CurDAG->RemoveDeadNodes();
-    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
-               CurDAG->dump(););
+    LLVM_DEBUG(dbgs() << "After PreProcess:\n"; CurDAG->dump(););
   }
 }
 
@@ -524,8 +385,8 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
 /// \returns The register class of the virtual register that will be used for
 /// the given operand number \OpNo or NULL if the register class cannot be
 /// determined.
-const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
-                                                          unsigned OpNo) const {
+const TargetRegisterClass *
+AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
   if (!N->isMachineOpcode()) {
     if (N->getOpcode() == ISD::CopyToReg) {
       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
@@ -563,14 +424,14 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
     SDValue SubRegOp = N->getOperand(OpNo + 1);
     unsigned SubRegIdx = SubRegOp->getAsZExtVal();
     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
-                                                              SubRegIdx);
+                                                               SubRegIdx);
   }
   }
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                          SDValue Glue) const {
-  SmallVector <SDValue, 8> Ops;
+  SmallVector<SDValue, 8> Ops;
   Ops.push_back(NewChain); // Replace the chain.
   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
     Ops.push_back(N->getOperand(i));
@@ -580,8 +441,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
-  const SITargetLowering& Lowering =
-    *static_cast<const SITargetLowering*>(getTargetLowering());
+  const SITargetLowering &Lowering =
+      *static_cast<const SITargetLowering *>(getTargetLowering());
 
   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
 
@@ -598,8 +459,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
     MachineFunction &MF = CurDAG->getMachineFunction();
     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
-    return
-        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+    return glueCopyToM0(N,
+                        CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
   }
   return N;
 }
@@ -681,7 +542,7 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   }
 
   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
-                                  "supported yet");
+                                "supported yet");
   // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
@@ -707,8 +568,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   if (NOps != NumVectorElts) {
     // Fill in the missing undef elements if this was a scalar_to_vector.
     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
-    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
-                                                   DL, EltVT);
+    MachineSDNode *ImpDef =
+        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT);
     for (unsigned i = NOps; i < NumVectorElts; ++i) {
       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
                                  i * EltSizeInRegs, EltSizeInRegs)
@@ -817,7 +678,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return;   // Already selected.
+    return; // Already selected.
   }
 
   // isa<MemSDNode> almost works but is slightly too permissive for some DS
@@ -905,8 +766,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     } else {
       llvm_unreachable("Unhandled value type for BUILD_PAIR");
     }
-    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
-                            N->getOperand(1), SubReg1 };
+    const SDValue Ops[] = {RC, N->getOperand(0), SubReg0, N->getOperand(1),
+                           SubReg1};
     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                           N->getValueType(0), Ops));
     return;
@@ -957,8 +818,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     uint32_t OffsetVal = Offset->getZExtValue();
     uint32_t WidthVal = Width->getZExtValue();
 
-    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
-                            WidthVal));
+    ReplaceNode(
+        N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
     return;
   }
   case AMDGPUISD::DIV_SCALE: {
@@ -974,8 +835,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::UMUL_LOHI:
     return SelectMUL_LOHI(N);
   case ISD::CopyToReg: {
-    const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    const SITargetLowering &Lowering =
+        *static_cast<const SITargetLowering *>(getTargetLowering());
     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
     break;
   }
@@ -1003,7 +864,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     if (N->getValueType(0) == MVT::i32) {
       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
-                              { N->getOperand(0), N->getOperand(1) });
+                              {N->getOperand(0), N->getOperand(1)});
       SelectCode(N);
       return;
     }
@@ -1166,7 +1027,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
-            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
     Base = Addr.getOperand(0);
     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else {
@@ -1179,9 +1040,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
 
 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                        const SDLoc &DL) const {
-  SDNode *Mov = CurDAG->getMachineNode(
-    AMDGPU::S_MOV_B32, DL, MVT::i32,
-    CurDAG->getTargetConstant(Val, DL, MVT::i32));
+  SDNode *Mov =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                             CurDAG->getTargetConstant(Val, DL, MVT::i32));
   return SDValue(Mov, 0);
 }
 
@@ -1322,7 +1183,8 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
 }
 
 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
-  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+  // omod
   SDValue Ops[10];
 
   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -1360,8 +1222,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
 
   assert(VT == MVT::f32 || VT == MVT::f64);
 
-  unsigned Opc
-    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
+  unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64
+                                  : AMDGPU::V_DIV_SCALE_F32_e64;
 
   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
   // omod
@@ -1388,8 +1250,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
 
   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
-  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
-                    Clamp };
+  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp};
 
   if (UseNoCarry) {
     MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
@@ -1468,7 +1329,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
     // sub C, x -> add (sub 0, x), C
-    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+    if (const ConstantSDNode *C =
+            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
       int64_t ByteOffset = C->getSExtValue();
       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1476,8 +1338,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
         // XXX - This is kind of hacky. Create a dummy sub node so we can check
         // the known bits in isDSOffsetLegal. We need to emit the selected node
         // here, so this is thrown away.
-        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
-                                      Zero, Addr.getOperand(1));
+        SDValue Sub =
+            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
 
         if (isDSOffsetLegal(Sub, ByteOffset)) {
           SmallVector<SDValue, 3> Opnds;
@@ -1511,8 +1373,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
 
     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
-                                 DL, MVT::i32, Zero);
+      MachineSDNode *MovZero =
+          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
       Base = SDValue(MovZero, 0);
       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
       return true;
@@ -1817,8 +1679,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   if (C->getSExtValue()) {
     SDLoc DL(Addr);
 
-    const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    const SITargetLowering &Lowering =
+        *static_cast<const SITargetLowering *>(getTargetLowering());
 
     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
     return true;
@@ -1827,7 +1689,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return false;
 }
 
-std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+std::pair<SDValue, SDValue>
+AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
   SDLoc DL(N);
 
   auto *FI = dyn_cast<FrameIndexSDNode>(N);
@@ -1841,9 +1704,9 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
   return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
-                                                 SDValue Addr, SDValue &Rsrc,
-                                                 SDValue &VAddr, SDValue &SOffset,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
+                                                 SDValue &Rsrc, SDValue &VAddr,
+                                                 SDValue &SOffset,
                                                  SDValue &ImmOffset) const {
 
   SDLoc DL(Addr);
@@ -1861,8 +1724,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
       const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
       SDValue HighBits =
           CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
-      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
-        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+      MachineSDNode *MovHighBits =
+          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
       VAddr = SDValue(MovHighBits, 0);
 
       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1918,8 +1781,7 @@ static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
   return RC && TRI.isSGPRClass(RC);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
-                                                  SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
                                                   SDValue &SRsrc,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
@@ -1962,8 +1824,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
-                                           SDValue &SOffset, SDValue &Offset
-                                           ) const {
+                                           SDValue &SOffset,
+                                           SDValue &Offset) const {
   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
   const SIInstrInfo *TII = Subtarget->getInstrInfo();
 
@@ -1977,8 +1839,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                     maskTrailingOnes<uint64_t>(32); // Size
     SDLoc DL(Addr);
 
-    const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    const SITargetLowering &Lowering =
+        *static_cast<const SITargetLowering *>(getTargetLowering());
 
     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
     return true;
@@ -1999,14 +1861,14 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
 
 // Find a load or store from corresponding pattern root.
 // Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
-  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+static MemSDNode *findMemSDNode(SDNode *N) {
+  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
     return MN;
   assert(isa<BuildVectorSDNode>(N));
   for (SDValue V : N->op_values())
     if (MemSDNode *MN =
-          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
       return MN;
   llvm_unreachable("cannot find MemSDNode in the pattern!");
 }
@@ -2387,8 +2249,8 @@ static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
     // Materialize this into a scalar move for scalar address to avoid
     // readfirstlane.
     auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
-                                              FI->getValueType(0));
+    SDValue TFI =
+        CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                            MVT::i32, TFI, SAddr.getOperand(1)),
                     0);
@@ -2488,8 +2350,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
 
       if (isUInt<32>(RemainderOffset)) {
         SDNode *VMov = CurDAG->getMachineNode(
-          AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
-          CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
         if (!isFlatScratchBaseLegal(Addr))
@@ -2677,16 +2539,16 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
 
   const SDValue Ops[] = {
-    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
-    Addr,
-    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
-    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
-            0),
-    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+      CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+      Addr,
+      CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+              0),
+      CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
   };
 
-  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
-                                        Ops), 0);
+  return SDValue(
+      CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, Ops), 0);
 }
 
 // Match a base and an immediate (if Offset is not null) or an SGPR (if
@@ -2822,8 +2684,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                               /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
-                                            SDValue &Base,
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base,
                                             SDValue &Offset) const {
   SDLoc DL(Index);
 
@@ -2888,7 +2749,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
     if (0 < BVal && BVal <= CVal && CVal < 32) {
       bool Signed = N->getOpcode() == ISD::SRA;
       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
-                  32 - CVal));
+                              32 - CVal));
       return;
     }
   }
@@ -2933,7 +2794,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
         if (isMask_32(MaskVal)) {
           uint32_t WidthVal = llvm::popcount(MaskVal);
           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
-                      WidthVal));
+                                  WidthVal));
           return;
         }
       }
@@ -3028,8 +2889,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
   SDValue Cond = N->getOperand(1);
 
   if (Cond.isUndef()) {
-    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
-                         N->getOperand(2), N->getOperand(0));
+    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, N->getOperand(2),
+                         N->getOperand(0));
     return;
   }
 
@@ -3129,8 +2990,8 @@ void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
   // be copied to an SGPR with readfirstlane.
-  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
-    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? AMDGPU::DS_APPEND
+                                                       : AMDGPU::DS_CONSUME;
 
   SDValue Chain = N->getOperand(0);
   SDValue Ptr = N->getOperand(2);
@@ -3156,10 +3017,8 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
   }
 
   SDValue Ops[] = {
-    Offset,
-    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
-    Chain,
-    N->getOperand(N->getNumOperands() - 1) // New glue
+      Offset, CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), Chain,
+      N->getOperand(N->getNumOperands() - 1) // New glue
   };
 
   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
@@ -3282,14 +3141,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
     // Prefer to do the shift in an SGPR since it should be possible to use m0
     // as the result directly. If it's already an SGPR, it will be eliminated
     // later.
-    SDNode *SGPROffset
-      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
-                               BaseOffset);
+    SDNode *SGPROffset = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
+                                                MVT::i32, BaseOffset);
     // Shift to offset in m0
-    SDNode *M0Base
-      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
-                               SDValue(SGPROffset, 0),
-                               CurDAG->getTargetConstant(16, SL, MVT::i32));
+    SDNode *M0Base = CurDAG->getMachineNode(
+        AMDGPU::S_LSHL_B32, SL, MVT::i32, SDValue(SGPROffset, 0),
+        CurDAG->getTargetConstant(16, SL, MVT::i32));
     glueCopyToM0(N, SDValue(M0Base, 0));
   }
 
@@ -3369,27 +3226,27 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
 
   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
 
-  SDNode *InterpMov =
-    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
-        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
-        N->getOperand(3),  // Attr
-        N->getOperand(2),  // Attrchan
-        ToM0.getValue(1) // In glue
-  });
-
-  SDNode *InterpP1LV =
-    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
-        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
-        N->getOperand(1), // Src0
-        N->getOperand(3), // Attr
-        N->getOperand(2), // Attrchan
-        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
-        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
-        N->getOperand(4), // high
-        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
-        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
-        SDValue(InterpMov, 1)
-  });
+  SDNode *InterpMov = CurDAG->getMachineNode(
+      AMDGPU::V_INTERP_MOV_F32, DL, VTs,
+      {
+          CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+          N->getOperand(3),                           // Attr
+          N->getOperand(2),                           // Attrchan
+          ToM0.getValue(1)                            // In glue
+      });
+
+  SDNode *InterpP1LV = CurDAG->getMachineNode(
+      AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32,
+      {CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+       N->getOperand(1),                           // Src0
+       N->getOperand(3),                           // Attr
+       N->getOperand(2),                           // Attrchan
+       CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+       SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+       N->getOperand(4),      // high
+       CurDAG->getTargetConstant(0, DL, MVT::i1),  // $clamp
+       CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+       SDValue(InterpMov, 1)});
 
   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
 }
@@ -3516,8 +3373,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
 }
 
 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
-  SDValue Log2WaveSize =
-    CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
+  SDValue Log2WaveSize = CurDAG->getTargetConstant(
+      Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
   CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
                        {N->getOperand(0), Log2WaveSize});
 }
@@ -3790,14 +3647,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
 
     if (Lo.getValueSizeInBits() > VecSize) {
       Lo = CurDAG->getTargetExtractSubreg(
-        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
-        MVT::getIntegerVT(VecSize), Lo);
+          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+          MVT::getIntegerVT(VecSize), Lo);
     }
 
     if (Hi.getValueSizeInBits() > VecSize) {
       Hi = CurDAG->getTargetExtractSubreg(
-        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
-        MVT::getIntegerVT(VecSize), Hi);
+          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+          MVT::getIntegerVT(VecSize), Hi);
     }
 
     assert(Lo.getValueSizeInBits() <= VecSize &&
@@ -3837,15 +3694,18 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                 TRI->getSubRegFromChannel(NumRegs, NumRegs), SL, MVT::i32)};
 
         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
-                                             Src.getValueType(), Ops), 0);
+                                             Src.getValueType(), Ops),
+                      0);
       }
       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
       return true;
     }
 
     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
-      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
-                      .bitcastToAPInt().getZExtValue();
+      uint64_t Lit = cast<ConstantFPSDNode>(Lo)
+                         ->getValueAPF()
+                         .bitcastToAPInt()
+                         .getZExtValue();
       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
@@ -4527,7 +4387,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
     //                          1     0     1
     //                          1     1     0
     //                          1     1     1
-    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+    const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
 
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
       if (C->isAllOnes()) {
@@ -4588,8 +4448,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
     SDValue RHS = In.getOperand(1);
 
     SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
-    if (!getOperandBits(LHS, LHSBits) ||
-        !getOperandBits(RHS, RHSBits)) {
+    if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
       Src = std::move(Backup);
       return std::make_pair(0, 0);
     }
@@ -4783,7 +4642,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
     SDLoc SL(In);
     return CurDAG->getConstant(
-      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
   }
 
   SDValue Src;
@@ -4793,7 +4652,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   return SDValue();
 }
 
-bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
   assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
 
   const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
@@ -4802,7 +4661,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
   unsigned Limit = 0;
   bool AllUsesAcceptSReg = true;
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-    Limit < 10 && U != E; ++U, ++Limit) {
+       Limit < 10 && U != E; ++U, ++Limit) {
     const TargetRegisterClass *RC =
         getOperandRegClass(U->getUser(), U->getOperandNo());
 
@@ -4872,8 +4731,8 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
 }
 
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
-  const AMDGPUTargetLowering& Lowering =
-    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
+  const AMDGPUTargetLowering &Lowering =
+      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
   bool IsModified = false;
   do {
     IsModified = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index cf62874912742..a06c15594bf0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,7 +67,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
 
   bool runOnMachineFunction(MachineFunction &MF) override;
   bool matchLoadD16FromBuildVector(SDNode *N) const;
-  bool rewriteVGPRFrameAccess(SDNode *N);
   void PreprocessISelDAG() override;
   void Select(SDNode *N) override;
   void PostprocessISelDAG() override;
@@ -274,7 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                  SDValue &SrcMods) const;
 
   bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
-                   SDValue &Tbl) const;
+                    SDValue &Tbl) const;
 
   SDValue getHi16Elt(SDValue In) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index a3a1cf6f18bed..d8ff923619193 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -38,6 +38,50 @@ using namespace llvm;
 
 namespace {
 
+static bool isVGPRFrameLoad(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B32:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B64:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B96:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B128:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B160:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B192:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B224:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B256:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B288:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B320:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B352:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B384:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B512:
+  case AMDGPU::SI_VGPR_FRAME_LOAD_B1024:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool isVGPRFrameStore(unsigned Opc) {
+  switch (Opc) {
+  case AMDGPU::SI_VGPR_FRAME_STORE_B32:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B64:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B96:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B128:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B160:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B192:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B224:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B256:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B288:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B320:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B352:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B384:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B512:
+  case AMDGPU::SI_VGPR_FRAME_STORE_B1024:
+    return true;
+  default:
+    return false;
+  }
+}
+
 class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
 public:
   static char ID;
@@ -75,16 +119,20 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   // Collect the pseudos and determine how many dwords the backing tuple needs.
+  // Each pseudo carries a constant dword index and accesses as many dwords as
+  // its data register class is wide.
   SmallVector<MachineInstr *, 8> Worklist;
   unsigned NumDwords = 0;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
       unsigned Opc = MI.getOpcode();
-      if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
-          Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+      if (!isVGPRFrameLoad(Opc) && !isVGPRFrameStore(Opc))
         continue;
-      unsigned ByteOffset = MI.getOperand(1).getImm();
-      NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+      unsigned Dword = MI.getOperand(1).getImm();
+      unsigned AccessDwords =
+          TRI->getRegSizeInBits(*MRI.getRegClass(MI.getOperand(0).getReg())) /
+          32;
+      NumDwords = std::max(NumDwords, Dword + AccessDwords);
       Worklist.push_back(&MI);
     }
   }
@@ -108,13 +156,17 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
   for (MachineInstr *MI : Worklist) {
     MachineBasicBlock &MBB = *MI->getParent();
     const DebugLoc &DL = MI->getDebugLoc();
-    unsigned Dword = MI->getOperand(1).getImm() / 4;
-    unsigned SubReg = NumDwords == 1
-                          ? AMDGPU::NoSubRegister
-                          : SIRegisterInfo::getSubRegFromChannel(Dword);
+    unsigned Dword = MI->getOperand(1).getImm();
+    unsigned AccessDwords =
+        TRI->getRegSizeInBits(*MRI.getRegClass(MI->getOperand(0).getReg())) /
+        32;
+    unsigned SubReg =
+        (Dword == 0 && AccessDwords == NumDwords)
+            ? AMDGPU::NoSubRegister
+            : SIRegisterInfo::getSubRegFromChannel(Dword, AccessDwords);
 
     MachineInstr *Copy;
-    if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+    if (isVGPRFrameLoad(MI->getOpcode())) {
       Register Dst = MI->getOperand(0).getReg();
       Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
                  .addReg(Storage, {}, SubReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 32ab847c8d8f3..478f54b7cdfc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -404,11 +404,11 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
 // A "VGPR as memory" object can only be realized in registers today when every
 // access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
 // bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses need gfx13 support, which is not yet
-// available; such objects fall back to scratch instead.
+// indexing and escaping addresses are not yet supported; such objects fall back
+// to scratch instead.
 //
-// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
-// support so this fallback is no longer needed.
+// TODO: Lower dynamically-indexed / escaping VGPR objects so this fallback is
+// no longer needed.
 static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
                                             const DataLayout &DL) {
   // An access is lowerable if it covers a whole number of dwords and starts at
@@ -514,7 +514,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   setFunctionLimits(F);
 
   // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
-  // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
+  // GFX12 and later. On any other target the objects fall back to scratch.
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
   const bool TargetSupportsVGPRAsMemory =
       ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4a32b81b06ff5..db2301ba28359 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6457,7 +6457,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           else
             ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
         }
-        ClampInstr.addReg(Src0);              // src0
+        ClampInstr.addReg(Src0); // src0
         if (isFPOp)
           ClampInstr.addImm(SISrcMods::NONE); // src1 mod
         ClampInstr.addReg(Src1);              // src1
@@ -12432,18 +12432,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     case 12:
       if (!Subtarget->hasLDSLoadB96_B128())
         return SDValue();
-      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
-                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
-                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
-                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+      Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+                                      : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+                         : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
       break;
     case 16:
       if (!Subtarget->hasLDSLoadB96_B128())
         return SDValue();
-      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
-                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
-                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
-                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+      Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+                                      : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+                         : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
       break;
     }
 
@@ -12473,11 +12473,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
             ? 1
             : 0,
-        DL, MVT::i8));                                           // swz
+        DL, MVT::i8)); // swz
     Ops.push_back(
         DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
-    Ops.push_back(M0Val.getValue(0));                            // Chain
-    Ops.push_back(M0Val.getValue(1));                            // Glue
+    Ops.push_back(M0Val.getValue(0)); // Chain
+    Ops.push_back(M0Val.getValue(1)); // Glue
 
     auto *M = cast<MemSDNode>(Op);
     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
@@ -12555,7 +12555,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       Ops.push_back(VOffset);
     }
 
-    Ops.push_back(Op.getOperand(5));  // Offset
+    Ops.push_back(Op.getOperand(5)); // Offset
 
     unsigned Aux = Op.getConstantOperandVal(6);
     Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
@@ -14330,6 +14330,92 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
 }
 
+/// Lower a load/store of a "VGPR as memory" object (an alloca in
+/// AMDGPUAS::VGPR) into an AMDGPUISD::REG_{LOAD,STORE} node carrying the
+/// constant dword index of the access within the per-function VGPR file. These
+/// nodes are selected into register copies via the SI_VGPR_FRAME_* pseudos and
+/// the AMDGPUPrivateObjectVGPRs pass.
+///
+/// Returns SDValue() if the access cannot (yet) be resolved to a constant file
+/// offset; such objects are demoted to scratch by AMDGPUPromoteAlloca, so any
+/// access that survives to here is expected to fold to a constant offset.
+SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MemSDNode *MemOp = cast<MemSDNode>(Op);
+  const MachineFunction &MF = DAG.getMachineFunction();
+  SDLoc DL(Op);
+
+  // Resolve the constant byte offset of the access within the VGPR file
+  // directly from the frame index (plus a constant GEP offset); the frame index
+  // itself is not custom-lowered.
+  SDValue Ptr = MemOp->getBasePtr();
+  unsigned ExtraOffset = 0;
+  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+    auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1));
+    if (!C)
+      return SDValue();
+    ExtraOffset = C->getZExtValue();
+    Ptr = Ptr.getOperand(0);
+  }
+  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+  if (!FI)
+    return SDValue();
+  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+    return SDValue();
+  unsigned ByteOffset =
+      AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+  if (ByteOffset % 4 != 0)
+    return SDValue();
+
+  EVT MemVT = MemOp->getMemoryVT();
+  unsigned BitWidth = MemVT.getSizeInBits();
+  // Only whole-dword accesses are kept in registers; sub-dword and
+  // non-dword-multiple objects are demoted to scratch by AMDGPUPromoteAlloca.
+  if (BitWidth == 0 || BitWidth % 32 != 0)
+    return SDValue();
+  if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
+    return SDValue();
+
+  // Bail out for types we cannot handle (extending loads, truncating stores).
+  if (auto *Load = dyn_cast<LoadSDNode>(MemOp)) {
+    if (Load->getExtensionType() != ISD::NON_EXTLOAD)
+      return SDValue();
+  } else if (cast<StoreSDNode>(MemOp)->isTruncatingStore()) {
+    return SDValue();
+  }
+
+  // Use a register-legal i32 / vector-of-i32 view so a single node covers the
+  // whole access; bitcast through it when the memory type is not register
+  // legal (e.g. v4i8). Non-dword-multiple widths were already rejected above.
+  EVT RegVT = MemVT;
+  if (!isTypeLegal(RegVT)) {
+    unsigned NumDwords = BitWidth / 32;
+    RegVT = NumDwords == 1
+                ? EVT(MVT::i32)
+                : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumDwords);
+  }
+
+  SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+  SDValue Chain = MemOp->getChain();
+  if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+    SDValue Value = StoreOp->getValue();
+    if (RegVT != MemVT)
+      Value = DAG.getNode(ISD::BITCAST, DL, RegVT, Value);
+    return DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_STORE, DL, DAG.getVTList(MVT::Other),
+        {Chain, Value, Index}, MemVT, StoreOp->getMemOperand());
+  }
+
+  SDValue NewLoad = DAG.getMemIntrinsicNode(
+      AMDGPUISD::REG_LOAD, DL, DAG.getVTList(RegVT, MVT::Other), {Chain, Index},
+      MemVT, MemOp->getMemOperand());
+  if (RegVT == MemVT)
+    return NewLoad;
+  SDValue Value = DAG.getNode(ISD::BITCAST, DL, MemVT, NewLoad);
+  return DAG.getMergeValues({Value, NewLoad.getValue(1)}, DL);
+}
+
 /// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
 /// by the chain and intrinsic ID. Theoretically we would also need to check the
 /// specific intrinsic, but they all place the pointer operand first.
@@ -18567,6 +18653,15 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
 
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
+  // Lower "VGPR as memory" (AMDGPUAS::VGPR) loads and stores into
+  // AMDGPUISD::REG_LOAD / AMDGPUISD::REG_STORE. This only fires for memory
+  // nodes in that address space, so ordinary memory operations are unaffected.
+  unsigned Opc = N->getOpcode();
+  if ((Opc == ISD::LOAD || Opc == ISD::STORE) &&
+      cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::VGPR)
+    if (SDValue V = LowerLoadStoreVGPR(SDValue(N, 0), DCI.DAG))
+      return V;
+
   switch (N->getOpcode()) {
   case ISD::ABS:
     if (SDValue Res = promoteUniformUnaryOpToI32(SDValue(N, 0), DCI))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c98426cdac0b1..37f3bb37d1aef 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -34,16 +34,17 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   const GCNSubtarget *Subtarget;
 
 public:
-  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
-                                    CallingConv::ID CC,
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
                                     EVT VT) const override;
   unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                          CallingConv::ID CC,
                                          EVT VT) const override;
 
-  unsigned getVectorTypeBreakdownForCallingConv(
-    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
-    unsigned &NumIntermediates, MVT &RegisterVT) const override;
+  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+                                                CallingConv::ID CC, EVT VT,
+                                                EVT &IntermediateVT,
+                                                unsigned &NumIntermediates,
+                                                MVT &RegisterVT) const override;
 
   MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
 
@@ -72,8 +73,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
       AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
       AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
       AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
-  SDValue getPreloadedValue(SelectionDAG &DAG,
-                            const SIMachineFunctionInfo &MFI,
+  SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI,
                             EVT VT,
                             AMDGPUFunctionArgInfo::PreloadedValue) const;
 
@@ -81,8 +81,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                              SelectionDAG &DAG) const override;
   SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
-                                 MVT VT, unsigned Offset) const;
+  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT,
+                                 unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG, bool WithChain) const;
   SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
@@ -125,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLoadStoreVGPR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const;
@@ -133,8 +134,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
-  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
-                              SelectionDAG &DAG, ArrayRef<SDValue> Ops,
+  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG,
+                              ArrayRef<SDValue> Ops,
                               bool IsIntrinsic = false) const;
 
   SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
@@ -151,14 +152,12 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   /// Converts \p Op, which must be of floating point type, to the
   /// floating point type \p VT, by either extending or truncating it.
-  SDValue getFPExtOrFPRound(SelectionDAG &DAG,
-                            SDValue Op,
-                            const SDLoc &DL,
+  SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
                             EVT VT) const;
 
-  SDValue convertArgType(
-    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
-    bool Signed, const ISD::InputArg *Arg = nullptr) const;
+  SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
+                         SDValue Val, bool Signed,
+                         const ISD::InputArg *Arg = nullptr) const;
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
@@ -194,13 +193,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
 
-  SDValue performUCharToFloatCombine(SDNode *N,
-                                     DAGCombinerInfo &DCI) const;
+  SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  SDValue performSHLPtrCombine(SDNode *N,
-                               unsigned AS,
-                               EVT MemVT,
+  SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT,
                                DAGCombinerInfo &DCI) const;
 
   SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
@@ -234,8 +230,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
-  unsigned getFusedOpcode(const SelectionDAG &DAG,
-                          const SDNode *N0, const SDNode *N1) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0,
+                          const SDNode *N1) const;
   SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                           DAGCombinerInfo &DCI) const;
@@ -398,7 +394,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   getPreferredVectorAction(MVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                        Type *Ty) const override;
+                                         Type *Ty) const override;
 
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
@@ -417,8 +413,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool supportSplitCSR(MachineFunction *MF) const override;
   void initializeSplitCSR(MachineBasicBlock *Entry) const override;
   void insertCopiesSplitCSR(
-    MachineBasicBlock *Entry,
-    const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+      MachineBasicBlock *Entry,
+      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
@@ -426,8 +422,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
-  bool CanLowerReturn(CallingConv::ID CallConv,
-                      MachineFunction &MF, bool isVarArg,
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       LLVMContext &Context, const Type *RetTy) const override;
 
@@ -436,13 +432,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
 
-  void passSpecialInputs(
-    CallLoweringInfo &CLI,
-    CCState &CCInfo,
-    const SIMachineFunctionInfo &Info,
-    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
-    SmallVectorImpl<SDValue> &MemOpChains,
-    SDValue Chain) const;
+  void
+  passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo,
+                    const SIMachineFunctionInfo &Info,
+                    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+                    SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const;
 
   SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
                           CallingConv::ID CallConv, bool isVarArg,
@@ -454,10 +448,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool mayBeEmittedAsTailCall(const CallInst *) const override;
 
   bool isEligibleForTailCallOptimization(
-    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+      const SmallVectorImpl<ISD::OutputArg> &Outs,
+      const SmallVectorImpl<SDValue> &OutVals,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
 
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
@@ -473,7 +467,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
-  Register getRegisterByName(const char* RegName, LLT VT,
+  Register getRegisterByName(const char *RegName, LLT VT,
                              const MachineFunction &MF) const override;
 
   MachineBasicBlock *splitKillBlock(MachineInstr &MI,
@@ -530,8 +524,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
   bool checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                              uint64_t Val) const;
-  bool checkAsmConstraintValA(SDValue Op,
-                              uint64_t Val,
+  bool checkAsmConstraintValA(SDValue Op, uint64_t Val,
                               unsigned MaxSize = 64) const;
   SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
                    SDValue V) const;
@@ -542,8 +535,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                      const APInt &DemandedElts,
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
-  void computeKnownBitsForFrameIndex(int FrameIdx,
-                                     KnownBits &Known,
+  void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known,
                                      const MachineFunction &MF) const override;
   void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R,
                                       KnownBits &Known,
@@ -589,8 +581,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicLoad(LoadInst *LI) const override;
   void emitExpandAtomicStore(StoreInst *SI) const override;
 
-  LoadInst *
-  lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+  LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
 
   const TargetRegisterClass *getRegClassFor(MVT VT,
                                             bool isDivergent) const override;
@@ -600,8 +591,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   unsigned
   getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override;
 
-  void allocateHSAUserSGPRs(CCState &CCInfo,
-                            MachineFunction &MF,
+  void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF,
                             const SIRegisterInfo &TRI,
                             SIMachineFunctionInfo &Info) const;
 
@@ -616,28 +606,21 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                            const SIRegisterInfo &TRI,
                            SIMachineFunctionInfo &Info) const;
 
-  void allocateSystemSGPRs(CCState &CCInfo,
-                           MachineFunction &MF,
+  void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
                            SIMachineFunctionInfo &Info,
-                           CallingConv::ID CallConv,
-                           bool IsShader) const;
+                           CallingConv::ID CallConv, bool IsShader) const;
 
-  void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
-                                      MachineFunction &MF,
+  void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
-  void allocateSpecialInputSGPRs(
-    CCState &CCInfo,
-    MachineFunction &MF,
-    const SIRegisterInfo &TRI,
-    SIMachineFunctionInfo &Info) const;
-
-  void allocateSpecialInputVGPRs(CCState &CCInfo,
-                                 MachineFunction &MF,
+  void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF,
                                  const SIRegisterInfo &TRI,
                                  SIMachineFunctionInfo &Info) const;
-  void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
-                                      MachineFunction &MF,
+
+  void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) const;
+  void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8c30e53e9b4e4..f106bbdacb957 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -59,6 +59,19 @@ def GFX10Gen         : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.G
 // modifier behavior with dx10_enable.
 def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
 
+// "VGPR as memory" (AMDGPUAS::VGPR) whole-dword load/store with a dword index
+// operand into the per-function VGPR file. When the index is a compile-time
+// constant these are selected to the width-matched SI_VGPR_FRAME_* pseudos,
+// which are rewritten into register copies before register allocation.
+def SDTRegIdxLoad : SDTypeProfile<1, 1,
+    [SDTCisVT<1, i32>]>; // result 0: data; operand 1: dword_index
+def SDTRegIdxStore : SDTypeProfile<0, 2,
+    [SDTCisVT<1, i32>]>; // operand 0: data; operand 1: dword_index
+def SIreg_load : SDNode<"AMDGPUISD::REG_LOAD", SDTRegIdxLoad,
+                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def SIreg_store : SDNode<"AMDGPUISD::REG_STORE", SDTRegIdxStore,
+                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 def SDTSBufferLoad : SDTypeProfile<1, 3,
     [                    // vdata
      SDTCisVT<1, v4i32>, // rsrc
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3594caef86782..80a42a66b2368 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,25 +1243,46 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
 }
 } // End Spill = 1, VALU = 1, isConvergent = 1
 
-// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
-// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
-// within the per-function VGPR file. They are produced during instruction
-// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
-// pass before register allocation.
+// "VGPR as memory" pseudo accesses: a load/store of a whole VGPR tuple (one or
+// more dwords) from/to an alloca in the VGPR address space (AMDGPUAS::VGPR), at
+// a constant dword index within the per-function VGPR file. They are selected
+// from AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
+// register copies by the AMDGPUPrivateObjectVGPRs pass before register
+// allocation.
 let hasSideEffects = 0 in {
-def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
-                                        (ins i32imm:$offset)> {
-  let mayLoad = 1;
-  let mayStore = 0;
-}
-
-def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
-                                         (ins VGPR_32:$vdata, i32imm:$offset)> {
-  let mayLoad = 0;
-  let mayStore = 1;
+foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
+              VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
+              VReg_512, VReg_1024] in {
+  def SI_VGPR_FRAME_LOAD_B#rc.Size : VPseudoInstSI <
+      (outs rc:$vdst), (ins i32imm:$idx)> {
+    let mayLoad = 1;
+    let mayStore = 0;
+  }
+  def SI_VGPR_FRAME_STORE_B#rc.Size : VPseudoInstSI <
+      (outs), (ins rc:$vdata, i32imm:$idx)> {
+    let mayLoad = 0;
+    let mayStore = 1;
+  }
 }
 } // End hasSideEffects = 0
 
+// Select AMDGPUISD::REG_{LOAD,STORE} (with a constant dword index) into the
+// width-matched frame pseudo.
+multiclass VGPRFrameLoadStorePat<ValueType vt> {
+  defvar load_inst = !cast<Instruction>("SI_VGPR_FRAME_LOAD_B"#vt.Size);
+  defvar store_inst = !cast<Instruction>("SI_VGPR_FRAME_STORE_B"#vt.Size);
+  def : GCNPat<(vt (SIreg_load (i32 imm:$idx))), (load_inst imm:$idx)>;
+  def : GCNPat<(SIreg_store vt:$data, (i32 imm:$idx)),
+               (store_inst $data, imm:$idx)>;
+}
+
+foreach vt = !listconcat(
+    Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
+    Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
+    Reg288Types.types, Reg320Types.types, Reg352Types.types, Reg384Types.types,
+    Reg512Types.types, Reg1024Types.types) in
+defm : VGPRFrameLoadStorePat<vt>;
+
 // VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
 // needs to be used and an extra instruction to move between VGPR and AGPR.
 // UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
index 63ba44b479279..0a78d119ded18 100644
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -1,12 +1,11 @@
 ; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; and GFX12 and later. On a supported target the object is kept in addrspace(13)
 ; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
 ; scratch.
 
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
 ; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP

>From 36969e8ccac76e5f0089b214cb59f0d73a3cfc33 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 26 Jun 2026 09:02:49 -0500
Subject: [PATCH 3/3] Refactor implementation of VGPR as mem according to
 review comments

---
 clang/include/clang/Basic/AttrDocs.td         |  18 +-
 .../clang/Basic/DiagnosticCommonKinds.td      |   5 -
 clang/lib/CodeGen/CGDecl.cpp                  |  47 +-
 clang/test/CodeGen/target-data.c              |   4 +-
 .../CodeGenHIP/amdgpu-vgpr-O0-warning.hip     |  14 -
 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip      |  19 +
 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip    |  13 +-
 clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl |   2 +-
 llvm/docs/AMDGPUUsage.rst                     |  22 +-
 llvm/include/llvm/Support/AMDGPUAddrSpace.h   |   9 +-
 llvm/lib/IR/AutoUpgrade.cpp                   |   5 +
 llvm/lib/IR/Verifier.cpp                      |   7 +
 llvm/lib/IR/VerifierAMDGPU.cpp                |  39 +-
 llvm/lib/IR/VerifierInternal.h                |   4 +
 llvm/lib/Target/AMDGPU/AMDGPU.h               |  15 +-
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 303 ++++++------
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |   2 +-
 .../Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp  | 254 ++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   2 +-
 .../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp       | 138 +++---
 .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 233 +--------
 .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp    |  15 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  43 +-
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     | 442 ++++++++++++++++--
 llvm/lib/Target/AMDGPU/SIISelLowering.h       | 114 +++--
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  39 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |  11 +
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |  17 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  54 +++
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |   9 +
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  12 -
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |  11 -
 llvm/lib/TargetParser/TargetDataLayout.cpp    |   4 +-
 .../AMDGPU/amdgpu-vgpr-allocate-basic.ll      | 109 -----
 .../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll     |  19 -
 .../CodeGen/AMDGPU/as-vgpr-alloca-static.ll   |  58 ---
 llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll  |   4 +-
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |  19 +-
 llvm/test/CodeGen/AMDGPU/nullptr.ll           |   2 +-
 .../CodeGen/AMDGPU/sgpr-regalloc-flags.ll     |   1 +
 .../AMDGPU/vgpr-as-memory-constexpr.ll        |  44 ++
 .../CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll  | 288 ++++++++++++
 .../AMDGPU/vgpr-as-memory-gisel-fallback.ll   |  28 ++
 .../AMDGPU/vgpr-as-memory-lower-module.ll     |  80 ++++
 .../CodeGen/AMDGPU/vgpr-as-memory-subdword.ll |  63 +++
 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll    |  73 +++
 llvm/test/Verifier/AMDGPU/alloca.ll           |  56 +--
 llvm/test/Verifier/AMDGPU/vgpr-memory.ll      |  33 ++
 .../Bitcode/DataLayoutUpgradeTest.cpp         |  38 +-
 50 files changed, 1919 insertions(+), 923 deletions(-)
 delete mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
 create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
 delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
 create mode 100644 llvm/test/Verifier/AMDGPU/vgpr-memory.ll

diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index b80265a1aec1d..7439bc23f10d1 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3607,20 +3607,22 @@ An error will be given if:
 def AMDGPUVGPRDocs : Documentation {
   let Category = DocCatAMDGPUAttributes;
   let Content = [{
-This attribute requests that a kernel-local variable be allocated in the
-"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
-so that accesses with statically known indices lower to vector register
-copies instead of scratch memory traffic.
+This attribute requests that a kernel-local variable be placed in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target, so that
+its accesses lower to vector register copies (constant index) or hardware
+register-indexing sequences (dynamic index) instead of scratch memory traffic.
+
+Such a variable is backed by a fixed block of vector registers rather than the
+stack, so - like an LDS/``__shared__`` variable - it is emitted as an internal
+global in ``addrspace(13)`` with a ``poison`` initializer; its contents are
+undefined until written. This is honored at every optimization level, including
+``-O0``.
 
 Clang supports the ``__attribute__((amdgpu_vgpr))`` or
 ``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
 local variables declared in a ``__global__`` (kernel) function; applying it to
 a variable in a ``__device__`` or host function, or outside HIP/CUDA, is an
 error.
-
-Known limitation: the request is only honored with optimizations enabled. At
-``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
-emitted.
   }];
 }
 
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index fe03be43c80c7..f2ed2f4698b8d 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,11 +319,6 @@ def warn_stack_protection_ignore_attribute : Warning<
   "'stack_protector_ignore' attribute ignored due to "
   "'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
 
-def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
-  "%0 is not guaranteed to keep the variable in vector registers at -O0; "
-  "it may fall back to scratch memory">,
-  InGroup<DiagGroup<"amdgpu-vgpr">>;
-
 def warn_slh_does_not_support_asm_goto : Warning<
   "speculative load hardening does not protect functions with asm goto">,
   InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index bca2d11d47c6a..471746ee8522a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1603,30 +1603,23 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
       // building the instruction so that it's there even in no-asserts
       // builds.
       //
-      // "VGPR as memory" objects keep their backing registers only once the
-      // optimizing register allocator runs. At -O0 the backend cannot lower
-      // these accesses (e.g. when the address escapes a basic block), so the
-      // request is not honored: fall back to an ordinary (scratch) alloca and
-      // warn, matching the documented behavior.
-      // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
-      // backing tuple to scratch) so this fallback can be removed.
+      // A "VGPR as memory" object (amdgpu_vgpr) is backed by a fixed block of
+      // vector registers rather than the stack, so - like LDS/__shared__ - it
+      // is emitted as an internal global variable in AMDGPUAS::VGPR. Its
+      // contents are not statically initializable (the backing registers have
+      // no defined initial value), so the initializer is poison.
+      // AMDGPULowerModuleVGPRs lays these out and the backend lowers accesses
+      // to register copies (constant index) or indexed moves (dynamic index).
       const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
-      const bool UseVGPRMemory =
-          VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
-      if (VGPRAttr && !UseVGPRMemory)
-        CGM.getDiags().Report(D.getLocation(),
-                              diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
-            << VGPRAttr;
-
-      if (UseVGPRMemory) {
-        // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
-        // address space so that statically indexed accesses lower to vector
-        // register copies instead of scratch memory.
-        auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
-                                        /*ArraySize=*/nullptr, D.getName(),
-                                        AllocaInsertPt->getIterator());
-        AI->setAlignment(allocaAlignment.getAsAlign());
-        AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+      if (VGPRAttr) {
+        auto *GV = new llvm::GlobalVariable(
+            CGM.getModule(), allocaTy, /*isConstant=*/false,
+            llvm::GlobalValue::InternalLinkage,
+            llvm::PoisonValue::get(allocaTy), getStaticDeclName(CGM, D),
+            /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+            llvm::AMDGPUAS::VGPR);
+        GV->setAlignment(allocaAlignment.getAsAlign());
+        AllocaAddr = RawAddress(GV, allocaTy, allocaAlignment, KnownNonNull);
         address = AllocaAddr;
       } else {
         address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
@@ -1641,10 +1634,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
           D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
 
       // Emit a lifetime intrinsic if meaningful. There's no point in doing this
-      // if we don't have a valid insertion point (?). "VGPR as memory" allocas
-      // live in a non-alloca address space, so the standard lifetime markers
-      // (which assume the alloca address space) are skipped for them.
-      if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
+      // if we don't have a valid insertion point (?). "VGPR as memory" objects
+      // are globals, not allocas, so the standard lifetime markers (which
+      // assume a stack slot) are skipped for them.
+      if (HaveInsertPoint() && !IsMSCatchParam && !VGPRAttr) {
         // If there's a jump into the lifetime of this variable, its lifetime
         // gets broken up into several regions in IR, which requires more work
         // to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index a5e0b814c7042..f03aaba8b53dd 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
 
 // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // Test default -target-cpu
 // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
deleted file mode 100644
index 4d23008b8ef43..0000000000000
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
-// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
-//
-// At -O0 "VGPR as memory" is not honored: the variable falls back to an
-// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
-
-#define __global__ __attribute__((global))
-
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
-__global__ void kernel(int *out, int i) {
-  int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
-  buf[2] = i;
-  out[0] = buf[2];
-}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
new file mode 100644
index 0000000000000..b8618433055cb
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN:   -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// "VGPR as memory" is honored at every optimization level (it is a global, not
+// an alloca that depends on the optimizing register allocator), so at -O0 the
+// variable is still emitted in addrspace(13) with no diagnostic.
+
+// expected-no-diagnostics
+
+#define __global__ __attribute__((global))
+
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+__global__ void kernel(int *out, int i) {
+  int buf[4] __attribute__((amdgpu_vgpr));
+  buf[2] = i;
+  out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
index 9a5c38e48951c..12a1c24284811 100644
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -4,14 +4,15 @@
 
 #define __global__ __attribute__((global))
 
-// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
-// memory" address space (addrspace(13)), and its accesses stay in that space.
+// A kernel-local variable marked amdgpu_vgpr is emitted as an internal global
+// in the "VGPR as memory" address space (addrspace(13)) with a poison
+// initializer (like an LDS/__shared__ variable), and its accesses stay in that
+// space.
 
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
 // CHECK-LABEL: define {{.*}}@_Z6kernelPii(
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
-// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
-// CHECK: store i32 %{{.*}}, ptr addrspace(13)
-// CHECK: load i32, ptr addrspace(13)
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+// CHECK: load i32, ptr addrspace(13) {{.*}}@{{.*}}buf
 __global__ void kernel(int *out, int i) {
   int buf[4] __attribute__((amdgpu_vgpr));
   buf[2] = i;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..f120db1aaf6cd 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
 
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
 void foo(void) {}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8aad903f98561..916dfb9d3b70f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -982,7 +982,7 @@ supported for the ``amdgcn`` target.
      *reserved for future use*             10
      *reserved for future use*             11
      *reserved for downstream use (LLPC)*  12
-     *reserved for future use*             13
+     VGPR as memory                        13              N/A         VGPR             32      0xFFFFFFFF
      *reserved for future use*             14
      *reserved for future use*             16
      Streamout Registers                   128             N/A         GS_REGS
@@ -1092,6 +1092,26 @@ supported for the ``amdgcn`` target.
   When using code object V5 ``LIBOMPTARGET_STACK_SIZE`` may be used to provide the
   private segment size in bytes, for cases where a dynamic stack is used.
 
+**VGPR as memory**
+  The "VGPR as memory" address space holds small objects directly in vector
+  registers instead of scratch (private) memory, avoiding memory traffic for
+  frequently accessed kernel-local data. Objects in this address space are
+  represented as global variables (similar to how *Local* memory uses LDS
+  global variables) and are backed by a block of physical VGPRs that is
+  withheld from the register allocator for the duration of the function.
+
+  An address in this space is a register-relative dword index into the reserved
+  VGPR block, not a byte address into an addressable memory segment. A load or
+  store at a constant index lowers to a register copy to/from a fixed VGPR; a
+  load or store at a variable (dynamic) index lowers to a hardware register
+  indexing sequence. Sub-dword (8/16-bit) accesses are implemented as
+  read-modify-write of the containing dword.
+
+  Because the address is not a real memory address, the verifier rejects any
+  ``addrspacecast`` to or from this address space and any initializer other than
+  ``undef``/``poison`` on such a global. The numeric value 13 coincides with the
+  graphics-only ``CONSTANT_BUFFER_5`` alias; the two never co-exist.
+
 **Constant 32-bit**
   *TODO*
 
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index e9d3add54d054..206caf8305c5d 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -96,14 +96,18 @@ namespace AMDGPU {
 enum class FlatAddrSpace : unsigned { FLAT, FlatGlobal, FlatScratch };
 
 inline bool isFlatGlobalAddrSpace(unsigned AS) {
+  // AMDGPUAS::VGPR ("VGPR as memory") is backed by registers, not a
+  // flat-addressable memory segment, so it must not be treated as global even
+  // though its numeric value is greater than MAX_AMDGPU_ADDRESS.
   return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+         AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
 }
 
 inline bool isExtendedGlobalAddrSpace(unsigned AS) {
   return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
          AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
-         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+         (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
 }
 
 inline bool isConstantAddressSpace(unsigned AS) {
@@ -185,6 +189,7 @@ constexpr int64_t getNullPointerValue(unsigned AS) {
   case PRIVATE_ADDRESS:
   case LOCAL_ADDRESS:
   case REGION_ADDRESS:
+  case VGPR:
     return -1;
   default:
     return 0;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3a823f906b012..c753e9e2bf56a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6851,6 +6851,11 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
         Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
       if (!DL.contains("-p9") && !DL.starts_with("p9"))
         Res.append("-p9:192:256:256:32");
+
+      // Add sizing for address space 13 ("VGPR as memory"), whose pointers are
+      // 32-bit register-relative indices.
+      if (!DL.contains("-p13") && !DL.starts_with("p13"))
+        Res.append("-p13:32:32");
     }
 
     // Upgrade the ELF mangling mode.
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 648446555793b..f8a8f94aed0ca 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -807,6 +807,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
         "Global variable is too large to fit into the address space", &GV,
         GVType);
 
+  // Target-specific global variable checks.
+  verifyAMDGPUGlobalVariable(*this, GV);
+
   if (!GV.hasInitializer()) {
     visitGlobalValue(GV);
     return;
@@ -3738,6 +3741,10 @@ void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
     Check(SrcVTy->getElementCount() ==
               cast<VectorType>(DestTy)->getElementCount(),
           "AddrSpaceCast vector pointer number of elements mismatch", &I);
+
+  // Target-specific addrspacecast checks.
+  verifyAMDGPUAddrSpaceCast(*this, I);
+
   visitInstruction(I);
 }
 
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index de9a0c7bef132..1043f0ddcb311 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -23,6 +23,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/Support/AMDGPUAddrSpace.h"
@@ -122,10 +123,40 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
   if (!VS.TT.isAMDGPU())
     return;
 
-  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
-      AI.getAddressSpace() != AMDGPUAS::VGPR)
-    VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
-                   &AI);
+  if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+}
+
+void llvm::verifyAMDGPUGlobalVariable(VerifierSupport &VS,
+                                      const GlobalVariable &GV) {
+  if (!VS.TT.isAMDGPU())
+    return;
+
+  if (GV.getAddressSpace() != AMDGPUAS::VGPR)
+    return;
+
+  // "VGPR as memory" objects are backed by registers, which have no defined
+  // initial contents, so (like LDS) they cannot be statically initialized: the
+  // only permitted initializer is an undef/poison placeholder (isa<UndefValue>
+  // also matches poison).
+  Check(!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer()),
+        "global variable in the VGPR address space (13) cannot have an "
+        "initializer",
+        &GV);
+}
+
+void llvm::verifyAMDGPUAddrSpaceCast(VerifierSupport &VS,
+                                     const AddrSpaceCastInst &I) {
+  if (!VS.TT.isAMDGPU())
+    return;
+
+  // The VGPR address space (13) is register-backed and has no meaningful
+  // numeric address, so it cannot participate in addrspacecast.
+  unsigned SrcAS = I.getSrcAddressSpace();
+  unsigned DestAS = I.getDestAddressSpace();
+  Check(SrcAS != AMDGPUAS::VGPR && DestAS != AMDGPUAS::VGPR,
+        "addrspacecast to or from the VGPR address space (13) is not allowed",
+        &I);
 }
 
 bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/IR/VerifierInternal.h b/llvm/lib/IR/VerifierInternal.h
index 922385230179b..51d4c9eb9af21 100644
--- a/llvm/lib/IR/VerifierInternal.h
+++ b/llvm/lib/IR/VerifierInternal.h
@@ -221,6 +221,10 @@ void verifyAMDGPUFunctionMetadata(VerifierSupport &VS, const Function &F);
 
 void verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI);
 
+void verifyAMDGPUGlobalVariable(VerifierSupport &VS, const GlobalVariable &GV);
+
+void verifyAMDGPUAddrSpaceCast(VerifierSupport &VS, const AddrSpaceCastInst &I);
+
 void verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
                                CallBase &Call);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 3336ea6d1f943..14c67d542b286 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
 extern char &AMDGPUPreloadKernelArgumentsLegacyID;
 
 // Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
+FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaID;
 
@@ -279,15 +279,12 @@ struct AMDGPUPromoteAllocaPass
 void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
 extern char &AMDGPUPrivateObjectVGPRsID;
 
-// Allocates pre-existing VGPR address space allocas without performing any
-// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
-// objects remain functional.
-struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
-  AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
-  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ModulePass *createAMDGPULowerModuleVGPRsPass();
+void initializeAMDGPULowerModuleVGPRsPass(PassRegistry &);
+extern char &AMDGPULowerModuleVGPRsID;
 
-private:
-  TargetMachine &TM;
+struct AMDGPULowerModuleVGPRsPass : PassInfoMixin<AMDGPULowerModuleVGPRsPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
 struct AMDGPUPromoteAllocaToVectorPass
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 66c9353cd5c33..7330f3b13f3cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,10 +21,8 @@
 #include "R600RegisterInfo.h"
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/UniformityAnalysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -68,8 +66,7 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
 
   SDValue Srl = In.getOperand(0);
   if (Srl.getOpcode() == ISD::SRL) {
-    if (ConstantSDNode *ShiftAmt =
-            dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+    if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
       if (ShiftAmt->getZExtValue() == 16) {
         Out = stripBitcast(Srl.getOperand(0));
         return true;
@@ -285,20 +282,22 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
 
     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
-    SDValue Ops[] = {LdHi->getChain(), LdHi->getBasePtr(), TiedIn};
+    SDValue Ops[] = {
+      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+    };
 
     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
     if (LdHi->getMemoryVT() == MVT::i8) {
-      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD
-                   ? AMDGPUISD::LOAD_D16_HI_I8
-                   : AMDGPUISD::LOAD_D16_HI_U8;
+      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
     } else {
       assert(LdHi->getMemoryVT() == MVT::i16);
     }
 
     SDValue NewLoadHi =
-        CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, Ops,
-                                    LdHi->getMemoryVT(), LdHi->getMemOperand());
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+                                  Ops, LdHi->getMemoryVT(),
+                                  LdHi->getMemOperand());
 
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
@@ -317,20 +316,22 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
     if (LdLo->getMemoryVT() == MVT::i8) {
-      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD
-                   ? AMDGPUISD::LOAD_D16_LO_I8
-                   : AMDGPUISD::LOAD_D16_LO_U8;
+      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
     } else {
       assert(LdLo->getMemoryVT() == MVT::i16);
     }
 
     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
 
-    SDValue Ops[] = {LdLo->getChain(), LdLo->getBasePtr(), TiedIn};
+    SDValue Ops[] = {
+      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+    };
 
     SDValue NewLoadLo =
-        CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, Ops,
-                                    LdLo->getMemoryVT(), LdLo->getMemOperand());
+      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+                                  Ops, LdLo->getMemoryVT(),
+                                  LdLo->getMemOperand());
 
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
@@ -341,29 +342,31 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
 }
 
 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
-  bool MadeChange = false;
+  if (!Subtarget->d16PreservesUnusedBits())
+    return;
 
-  if (Subtarget->d16PreservesUnusedBits()) {
-    SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
-    while (Position != CurDAG->allnodes_begin()) {
-      SDNode *N = &*--Position;
-      if (N->use_empty())
-        continue;
+  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
 
-      switch (N->getOpcode()) {
-      case ISD::BUILD_VECTOR:
-        // TODO: Match load d16 from shl (extload:i16), 16
-        MadeChange |= matchLoadD16FromBuildVector(N);
-        break;
-      default:
-        break;
-      }
+  bool MadeChange = false;
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    if (N->use_empty())
+      continue;
+
+    switch (N->getOpcode()) {
+    case ISD::BUILD_VECTOR:
+      // TODO: Match load d16 from shl (extload:i16), 16
+      MadeChange |= matchLoadD16FromBuildVector(N);
+      break;
+    default:
+      break;
     }
   }
 
   if (MadeChange) {
     CurDAG->RemoveDeadNodes();
-    LLVM_DEBUG(dbgs() << "After PreProcess:\n"; CurDAG->dump(););
+    LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+               CurDAG->dump(););
   }
 }
 
@@ -385,8 +388,8 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
 /// \returns The register class of the virtual register that will be used for
 /// the given operand number \OpNo or NULL if the register class cannot be
 /// determined.
-const TargetRegisterClass *
-AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
   if (!N->isMachineOpcode()) {
     if (N->getOpcode() == ISD::CopyToReg) {
       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
@@ -424,14 +427,14 @@ AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
     SDValue SubRegOp = N->getOperand(OpNo + 1);
     unsigned SubRegIdx = SubRegOp->getAsZExtVal();
     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
-                                                               SubRegIdx);
+                                                              SubRegIdx);
   }
   }
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
                                          SDValue Glue) const {
-  SmallVector<SDValue, 8> Ops;
+  SmallVector <SDValue, 8> Ops;
   Ops.push_back(NewChain); // Replace the chain.
   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
     Ops.push_back(N->getOperand(i));
@@ -441,8 +444,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
 }
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
-  const SITargetLowering &Lowering =
-      *static_cast<const SITargetLowering *>(getTargetLowering());
+  const SITargetLowering& Lowering =
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
 
@@ -459,8 +462,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
     MachineFunction &MF = CurDAG->getMachineFunction();
     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
-    return glueCopyToM0(N,
-                        CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+    return
+        glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
   }
   return N;
 }
@@ -542,7 +545,7 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   }
 
   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
-                                "supported yet");
+                                  "supported yet");
   // 32 = Max Num Vector Elements
   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
   // 1 = Vector Register Class
@@ -568,8 +571,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
   if (NOps != NumVectorElts) {
     // Fill in the missing undef elements if this was a scalar_to_vector.
     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
-    MachineSDNode *ImpDef =
-        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT);
+    MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                   DL, EltVT);
     for (unsigned i = NOps; i < NumVectorElts; ++i) {
       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
                                  i * EltSizeInRegs, EltSizeInRegs)
@@ -678,7 +681,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return; // Already selected.
+    return;   // Already selected.
   }
 
   // isa<MemSDNode> almost works but is slightly too permissive for some DS
@@ -766,8 +769,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     } else {
       llvm_unreachable("Unhandled value type for BUILD_PAIR");
     }
-    const SDValue Ops[] = {RC, N->getOperand(0), SubReg0, N->getOperand(1),
-                           SubReg1};
+    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+                            N->getOperand(1), SubReg1 };
     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
                                           N->getValueType(0), Ops));
     return;
@@ -818,8 +821,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     uint32_t OffsetVal = Offset->getZExtValue();
     uint32_t WidthVal = Width->getZExtValue();
 
-    ReplaceNode(
-        N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+    ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
+                            WidthVal));
     return;
   }
   case AMDGPUISD::DIV_SCALE: {
@@ -835,8 +838,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   case ISD::UMUL_LOHI:
     return SelectMUL_LOHI(N);
   case ISD::CopyToReg: {
-    const SITargetLowering &Lowering =
-        *static_cast<const SITargetLowering *>(getTargetLowering());
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
     break;
   }
@@ -864,7 +867,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     if (N->getValueType(0) == MVT::i32) {
       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
-                              {N->getOperand(0), N->getOperand(1)});
+                              { N->getOperand(0), N->getOperand(1) });
       SelectCode(N);
       return;
     }
@@ -1027,7 +1030,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
-             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
     Base = Addr.getOperand(0);
     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
   } else {
@@ -1040,9 +1043,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
 
 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
                                                        const SDLoc &DL) const {
-  SDNode *Mov =
-      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                             CurDAG->getTargetConstant(Val, DL, MVT::i32));
+  SDNode *Mov = CurDAG->getMachineNode(
+    AMDGPU::S_MOV_B32, DL, MVT::i32,
+    CurDAG->getTargetConstant(Val, DL, MVT::i32));
   return SDValue(Mov, 0);
 }
 
@@ -1183,8 +1186,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
 }
 
 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
-  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp,
-  //  omod
+  //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
   SDValue Ops[10];
 
   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -1222,8 +1224,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
 
   assert(VT == MVT::f32 || VT == MVT::f64);
 
-  unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64
-                                  : AMDGPU::V_DIV_SCALE_F32_e64;
+  unsigned Opc
+    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
 
   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
   // omod
@@ -1250,7 +1252,8 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
     Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
 
   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
-  SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp};
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+                    Clamp };
 
   if (UseNoCarry) {
     MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
@@ -1329,8 +1332,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
     }
   } else if (Addr.getOpcode() == ISD::SUB) {
     // sub C, x -> add (sub 0, x), C
-    if (const ConstantSDNode *C =
-            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
       int64_t ByteOffset = C->getSExtValue();
       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1338,8 +1340,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
         // XXX - This is kind of hacky. Create a dummy sub node so we can check
         // the known bits in isDSOffsetLegal. We need to emit the selected node
         // here, so this is thrown away.
-        SDValue Sub =
-            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
+        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+                                      Zero, Addr.getOperand(1));
 
         if (isDSOffsetLegal(Sub, ByteOffset)) {
           SmallVector<SDValue, 3> Opnds;
@@ -1373,8 +1375,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
 
     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      MachineSDNode *MovZero =
-          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
+      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                 DL, MVT::i32, Zero);
       Base = SDValue(MovZero, 0);
       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
       return true;
@@ -1679,8 +1681,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   if (C->getSExtValue()) {
     SDLoc DL(Addr);
 
-    const SITargetLowering &Lowering =
-        *static_cast<const SITargetLowering *>(getTargetLowering());
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
 
     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
     return true;
@@ -1689,8 +1691,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return false;
 }
 
-std::pair<SDValue, SDValue>
-AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
   SDLoc DL(N);
 
   auto *FI = dyn_cast<FrameIndexSDNode>(N);
@@ -1704,9 +1705,9 @@ AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
   return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
-                                                 SDValue &Rsrc, SDValue &VAddr,
-                                                 SDValue &SOffset,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
+                                                 SDValue Addr, SDValue &Rsrc,
+                                                 SDValue &VAddr, SDValue &SOffset,
                                                  SDValue &ImmOffset) const {
 
   SDLoc DL(Addr);
@@ -1724,8 +1725,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
       const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
       SDValue HighBits =
           CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
-      MachineSDNode *MovHighBits =
-          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
       VAddr = SDValue(MovHighBits, 0);
 
       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1781,7 +1782,8 @@ static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
   return RC && TRI.isSGPRClass(RC);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
+                                                  SDValue Addr,
                                                   SDValue &SRsrc,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
@@ -1824,8 +1826,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
-                                           SDValue &SOffset,
-                                           SDValue &Offset) const {
+                                           SDValue &SOffset, SDValue &Offset
+                                           ) const {
   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
   const SIInstrInfo *TII = Subtarget->getInstrInfo();
 
@@ -1839,8 +1841,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
                     maskTrailingOnes<uint64_t>(32); // Size
     SDLoc DL(Addr);
 
-    const SITargetLowering &Lowering =
-        *static_cast<const SITargetLowering *>(getTargetLowering());
+    const SITargetLowering& Lowering =
+      *static_cast<const SITargetLowering*>(getTargetLowering());
 
     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
     return true;
@@ -1861,14 +1863,14 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
 
 // Find a load or store from corresponding pattern root.
 // Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode *findMemSDNode(SDNode *N) {
-  N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
+static MemSDNode* findMemSDNode(SDNode *N) {
+  N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
     return MN;
   assert(isa<BuildVectorSDNode>(N));
   for (SDValue V : N->op_values())
     if (MemSDNode *MN =
-            dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+          dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
       return MN;
   llvm_unreachable("cannot find MemSDNode in the pattern!");
 }
@@ -2249,8 +2251,8 @@ static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
     // Materialize this into a scalar move for scalar address to avoid
     // readfirstlane.
     auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
-    SDValue TFI =
-        CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+                                              FI->getValueType(0));
     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
                                            MVT::i32, TFI, SAddr.getOperand(1)),
                     0);
@@ -2350,8 +2352,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
 
       if (isUInt<32>(RemainderOffset)) {
         SDNode *VMov = CurDAG->getMachineNode(
-            AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
-            CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+          AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+          CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
         if (!isFlatScratchBaseLegal(Addr))
@@ -2539,16 +2541,16 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
 
   const SDValue Ops[] = {
-      CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
-      Addr,
-      CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
-      SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
-              0),
-      CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+    Addr,
+    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+            0),
+    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
   };
 
-  return SDValue(
-      CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, Ops), 0);
+  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+                                        Ops), 0);
 }
 
 // Match a base and an immediate (if Offset is not null) or an SGPR (if
@@ -2684,7 +2686,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
                               /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
-bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base,
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+                                            SDValue &Base,
                                             SDValue &Offset) const {
   SDLoc DL(Index);
 
@@ -2749,7 +2752,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
     if (0 < BVal && BVal <= CVal && CVal < 32) {
       bool Signed = N->getOpcode() == ISD::SRA;
       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
-                              32 - CVal));
+                  32 - CVal));
       return;
     }
   }
@@ -2794,7 +2797,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
         if (isMask_32(MaskVal)) {
           uint32_t WidthVal = llvm::popcount(MaskVal);
           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
-                                  WidthVal));
+                      WidthVal));
           return;
         }
       }
@@ -2889,8 +2892,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
   SDValue Cond = N->getOperand(1);
 
   if (Cond.isUndef()) {
-    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, N->getOperand(2),
-                         N->getOperand(0));
+    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
+                         N->getOperand(2), N->getOperand(0));
     return;
   }
 
@@ -2990,8 +2993,8 @@ void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
   // be copied to an SGPR with readfirstlane.
-  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? AMDGPU::DS_APPEND
-                                                       : AMDGPU::DS_CONSUME;
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
 
   SDValue Chain = N->getOperand(0);
   SDValue Ptr = N->getOperand(2);
@@ -3017,8 +3020,10 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
   }
 
   SDValue Ops[] = {
-      Offset, CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), Chain,
-      N->getOperand(N->getNumOperands() - 1) // New glue
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
   };
 
   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
@@ -3141,12 +3146,14 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
     // Prefer to do the shift in an SGPR since it should be possible to use m0
     // as the result directly. If it's already an SGPR, it will be eliminated
     // later.
-    SDNode *SGPROffset = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
-                                                MVT::i32, BaseOffset);
+    SDNode *SGPROffset
+      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+                               BaseOffset);
     // Shift to offset in m0
-    SDNode *M0Base = CurDAG->getMachineNode(
-        AMDGPU::S_LSHL_B32, SL, MVT::i32, SDValue(SGPROffset, 0),
-        CurDAG->getTargetConstant(16, SL, MVT::i32));
+    SDNode *M0Base
+      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+                               SDValue(SGPROffset, 0),
+                               CurDAG->getTargetConstant(16, SL, MVT::i32));
     glueCopyToM0(N, SDValue(M0Base, 0));
   }
 
@@ -3226,27 +3233,27 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
 
   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
 
-  SDNode *InterpMov = CurDAG->getMachineNode(
-      AMDGPU::V_INTERP_MOV_F32, DL, VTs,
-      {
-          CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
-          N->getOperand(3),                           // Attr
-          N->getOperand(2),                           // Attrchan
-          ToM0.getValue(1)                            // In glue
-      });
-
-  SDNode *InterpP1LV = CurDAG->getMachineNode(
-      AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32,
-      {CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
-       N->getOperand(1),                           // Src0
-       N->getOperand(3),                           // Attr
-       N->getOperand(2),                           // Attrchan
-       CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
-       SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
-       N->getOperand(4),      // high
-       CurDAG->getTargetConstant(0, DL, MVT::i1),  // $clamp
-       CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
-       SDValue(InterpMov, 1)});
+  SDNode *InterpMov =
+    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
+        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+        N->getOperand(3),  // Attr
+        N->getOperand(2),  // Attrchan
+        ToM0.getValue(1) // In glue
+  });
+
+  SDNode *InterpP1LV =
+    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
+        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+        N->getOperand(1), // Src0
+        N->getOperand(3), // Attr
+        N->getOperand(2), // Attrchan
+        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+        N->getOperand(4), // high
+        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+        SDValue(InterpMov, 1)
+  });
 
   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
 }
@@ -3373,8 +3380,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
 }
 
 void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
-  SDValue Log2WaveSize = CurDAG->getTargetConstant(
-      Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
+  SDValue Log2WaveSize =
+    CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
   CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
                        {N->getOperand(0), Log2WaveSize});
 }
@@ -3647,14 +3654,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
 
     if (Lo.getValueSizeInBits() > VecSize) {
       Lo = CurDAG->getTargetExtractSubreg(
-          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
-          MVT::getIntegerVT(VecSize), Lo);
+        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+        MVT::getIntegerVT(VecSize), Lo);
     }
 
     if (Hi.getValueSizeInBits() > VecSize) {
       Hi = CurDAG->getTargetExtractSubreg(
-          (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
-          MVT::getIntegerVT(VecSize), Hi);
+        (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+        MVT::getIntegerVT(VecSize), Hi);
     }
 
     assert(Lo.getValueSizeInBits() <= VecSize &&
@@ -3694,18 +3701,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
                 TRI->getSubRegFromChannel(NumRegs, NumRegs), SL, MVT::i32)};
 
         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
-                                             Src.getValueType(), Ops),
-                      0);
+                                             Src.getValueType(), Ops), 0);
       }
       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
       return true;
     }
 
     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
-      uint64_t Lit = cast<ConstantFPSDNode>(Lo)
-                         ->getValueAPF()
-                         .bitcastToAPInt()
-                         .getZExtValue();
+      uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+                      .bitcastToAPInt().getZExtValue();
       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
@@ -4387,7 +4391,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
     //                          1     0     1
     //                          1     1     0
     //                          1     1     1
-    const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
+    const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
 
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
       if (C->isAllOnes()) {
@@ -4448,7 +4452,8 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
     SDValue RHS = In.getOperand(1);
 
     SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
-    if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
+    if (!getOperandBits(LHS, LHSBits) ||
+        !getOperandBits(RHS, RHSBits)) {
       Src = std::move(Backup);
       return std::make_pair(0, 0);
     }
@@ -4642,7 +4647,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
     SDLoc SL(In);
     return CurDAG->getConstant(
-        C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+      C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
   }
 
   SDValue Src;
@@ -4652,7 +4657,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
   return SDValue();
 }
 
-bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
   assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
 
   const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
@@ -4661,7 +4666,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
   unsigned Limit = 0;
   bool AllUsesAcceptSReg = true;
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-       Limit < 10 && U != E; ++U, ++Limit) {
+    Limit < 10 && U != E; ++U, ++Limit) {
     const TargetRegisterClass *RC =
         getOperandRegClass(U->getUser(), U->getOperandNo());
 
@@ -4731,8 +4736,8 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
 }
 
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
-  const AMDGPUTargetLowering &Lowering =
-      *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
+  const AMDGPUTargetLowering& Lowering =
+    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
   bool IsModified = false;
   do {
     IsModified = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a06c15594bf0a..95f85a6151375 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -273,7 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                                  SDValue &SrcMods) const;
 
   bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
-                    SDValue &Tbl) const;
+                   SDValue &Tbl) const;
 
   SDValue getHi16Elt(SDValue In) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
new file mode 100644
index 0000000000000..5e4a0914a4366
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
@@ -0,0 +1,254 @@
+//===- AMDGPULowerModuleVGPRs.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lays out the "VGPR as memory" (addrspace(13)) globals of a module into a
+// single shared register "file" and records, on every function that takes part
+// in a call graph which uses the file, where that file lives.
+//
+// Unlike ordinary memory, the file is backed by a fixed block of physical
+// VGPRs. For an address into the file to be meaningful across a call (i.e. for
+// it to be passed between functions), every function in the call graph must
+// agree on (a) the byte offset of each global within the file and (b) the
+// physical register the file starts at. Computing these per function (as the
+// backend does on its own) does not satisfy (b), because the natural low base
+// sits just above each function's ABI input registers, which differ. This pass
+// resolves both module-wide:
+//
+//  * Offsets: the addrspace(13) globals of each independent layout group are
+//    packed, in a deterministic order, and each global's byte offset is
+//    recorded as "amdgpu.vgpr.memory.offset" metadata.
+//
+//  * Base: one base register index per group, chosen as the maximum ABI-input
+//    VGPR boundary over the group's functions (rounded up to even), so it is
+//    at or above every member's own inputs and identical across the group. It
+//    is kept as low as that maximum allows so occupancy is preserved.
+//
+// Both the total file size and the shared base are attached as the
+// "amdgpu-vgpr-memory-size" and "amdgpu-vgpr-memory-base" function attributes
+// to every function whose call graph uses the file (the file behaves like LDS:
+// it is live for a using kernel's entire execution, so all reachable functions
+// must reserve it). The backend consumes these:
+//   - SIISelLowering reads the per-global offset metadata.
+//   - SIMachineFunctionInfo reads the size/base attributes.
+//   - SIRegisterInfo::getVGPRMemoryFile reserves [base, base + size).
+//
+// TODO: Layout is per group of connected functions and globals, so every
+// function in a group reserves all of that group's globals, and a function
+// reachable from several kernels reserves the file even when called from a
+// kernel that does not use it. A per-kernel layout (as AMDGPULowerModuleLDS
+// does, with a table for shared callees) would tighten this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-module-vgprs"
+
+namespace {
+
+constexpr char SizeAttr[] = "amdgpu-vgpr-memory-size";
+constexpr char BaseAttr[] = "amdgpu-vgpr-memory-base";
+constexpr char OffsetMD[] = "amdgpu.vgpr.memory.offset";
+
+// Returns an upper bound on the number of VGPRs holding F's ABI inputs; the
+// shared file has to start at or above this register index.
+static unsigned inputVGPRBound(const Function &F) {
+  if (!AMDGPU::isKernel(F.getCallingConv())) {
+    // Graphics entry points and ordinary functions receive every argument that
+    // is not marked inreg in VGPRs (inreg arguments go in SGPRs). Charge each
+    // such argument its allocation size rounded up to whole dwords.
+    const DataLayout &Layout = F.getParent()->getDataLayout();
+    unsigned NumDwords = 0;
+    for (const Argument &Arg : F.args())
+      if (!Arg.hasAttribute(Attribute::InReg))
+        NumDwords += divideCeil(
+            Layout.getTypeAllocSize(Arg.getType()).getFixedValue(), 4u);
+    return NumDwords;
+  }
+
+  // Compute kernels read their arguments from the kernarg segment
+  // (SGPRs/memory), so their only VGPR input is the workitem ID, assumed here
+  // to be packed into one register (v0). NOTE(review): confirm this holds for
+  // subtargets that pass the Y/Z IDs in separate VGPRs.
+  const bool AllIDsUnused = F.hasFnAttribute("amdgpu-no-workitem-id-x") &&
+                            F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
+                            F.hasFnAttribute("amdgpu-no-workitem-id-z");
+  return AllIDsUnused ? 0 : 1;
+}
+
+class AMDGPULowerModuleVGPRs : public ModulePass { // Legacy PM entry point.
+public:
+  static char ID; // Defined out of line, after the anonymous namespace.
+  AMDGPULowerModuleVGPRs() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override; // Forwards to lowerModuleVGPRs().
+
+  StringRef getPassName() const override { return "AMDGPU Lower Module VGPRs"; }
+};
+
+} // end anonymous namespace
+
+char AMDGPULowerModuleVGPRs::ID = 0;
+char &llvm::AMDGPULowerModuleVGPRsID = AMDGPULowerModuleVGPRs::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleVGPRs, DEBUG_TYPE, "AMDGPU Lower Module VGPRs",
+                false, false)
+
+ModulePass *llvm::createAMDGPULowerModuleVGPRsPass() {
+  return new AMDGPULowerModuleVGPRs(); // Fresh heap-allocated legacy pass.
+}
+
+// Assigns every referenced addrspace(13) global an offset within its layout
+// group and records the group's file size and base on the group's functions.
+// Returns true iff the module was modified.
+static bool lowerModuleVGPRs(Module &M) {
+  // Nothing to do unless the module contains an addrspace(13) global.
+  if (llvm::none_of(M.globals(), [](const GlobalVariable &GV) {
+        return GV.getAddressSpace() == AMDGPUAS::VGPR;
+      }))
+    return false;
+
+  // Map each function to the addrspace(13) globals it directly references.
+  DenseMap<Function *, SmallVector<GlobalVariable *, 2>> Uses;
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    SmallPtrSet<GlobalVariable *, 4> Seen;
+    for (Instruction &I : instructions(F))
+      for (Value *Op : I.operands()) {
+        // getUnderlyingObject sees through (constant-expression) GEPs and
+        // casts, so a global referenced via `getelementptr(@g, off)` is found.
+        auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Op));
+        if (GV && GV->getAddressSpace() == AMDGPUAS::VGPR &&
+            Seen.insert(GV).second)
+          Uses[&F].push_back(GV);
+      }
+  }
+  if (Uses.empty())
+    return false; // no function references the file: module is unchanged
+
+  CallGraph CG(M);
+  auto Reachable = [&](Function *Root, SmallPtrSetImpl<Function *> &Out) {
+    SmallVector<Function *, 16> Work{Root};
+    while (!Work.empty()) {
+      Function *F = Work.pop_back_val();
+      if (!Out.insert(F).second)
+        continue;
+      if (CallGraphNode *N = CG[F])
+        for (auto &CR : *N)
+          if (Function *Callee = CR.second->getFunction())
+            if (!Callee->isDeclaration())
+              Work.push_back(Callee);
+    }
+  };
+
+  // Partition functions and globals into independent layout groups. The file is
+  // live for a using kernel's whole execution (like LDS), so a group must cover
+  // everything reachable from such a kernel; and a global must share a layout
+  // with every function that uses it. Disjoint kernels therefore land in
+  // separate groups and get independent (low, occupancy-friendly) bases, while
+  // data- or call-graph-shared functions stay in one consistent group.
+  //
+  // Functions and globals are both GlobalValues, so one union-find covers both.
+  EquivalenceClasses<const GlobalValue *> Groups;
+  for (auto &[F, GVs] : Uses)
+    for (GlobalVariable *GV : GVs)
+      Groups.unionSets(F, GV);
+
+  // Functions reachable from each file-using kernel join that kernel's group
+  // (so they reserve the file), and kernels sharing any callee merge.
+  for (Function &K : M) {
+    if (K.isDeclaration() || !AMDGPU::isEntryFunctionCC(K.getCallingConv()))
+      continue;
+    SmallPtrSet<Function *, 16> R;
+    Reachable(&K, R);
+    if (llvm::none_of(R, [&](Function *F) { return Uses.count(F); }))
+      continue; // this kernel does not use the file
+    for (Function *F : R)
+      Groups.unionSets(&K, F);
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  LLVMContext &Ctx = M.getContext();
+  Type *I32 = Type::getInt32Ty(Ctx);
+
+  // Lay out each group independently.
+  for (auto It = Groups.begin(), E = Groups.end(); It != E; ++It) {
+    const auto *Leader = *It;
+    if (!Leader->isLeader())
+      continue;
+    SmallVector<GlobalVariable *, 8> GroupGlobals;
+    SmallVector<Function *, 16> GroupFns;
+    for (auto MI = Groups.member_begin(*Leader); MI != Groups.member_end();
+         ++MI) {
+      const GlobalValue *GV = *MI;
+      if (auto *G = dyn_cast<GlobalVariable>(GV))
+        GroupGlobals.push_back(const_cast<GlobalVariable *>(G));
+      else
+        GroupFns.push_back(const_cast<Function *>(cast<Function>(GV)));
+    }
+    if (GroupGlobals.empty() || GroupFns.empty())
+      continue;
+
+    // Deterministic packed layout (sorted by name).
+    llvm::stable_sort(GroupGlobals, [](GlobalVariable *A, GlobalVariable *B) {
+      return A->getName() < B->getName();
+    });
+    unsigned Size = 0;
+    for (GlobalVariable *GV : GroupGlobals) {
+      Align A = std::max(
+          DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()),
+          Align(4));
+      unsigned Offset = alignTo(Size, A);
+      GV->setMetadata(OffsetMD,
+                      MDNode::get(Ctx, {ConstantAsMetadata::get(
+                                           ConstantInt::get(I32, Offset))}));
+      Size = Offset + DL.getTypeAllocSize(GV->getValueType()).getFixedValue();
+    }
+
+    // One base for the group: above every member's ABI inputs, even-aligned.
+    unsigned Base = 0;
+    for (Function *F : GroupFns)
+      Base = std::max(Base, inputVGPRBound(*F));
+    Base = alignTo(Base, 2u);
+
+    for (Function *F : GroupFns) {
+      F->addFnAttr(SizeAttr, utostr(Size));
+      F->addFnAttr(BaseAttr, utostr(Base));
+    }
+  }
+  return true;
+}
+
+bool AMDGPULowerModuleVGPRs::runOnModule(Module &M) {
+  return lowerModuleVGPRs(M); // Shared with AMDGPULowerModuleVGPRsPass::run.
+}
+
+PreservedAnalyses AMDGPULowerModuleVGPRsPass::run(Module &M,
+                                                  ModuleAnalysisManager &) {
+  const bool Changed = lowerModuleVGPRs(M);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b377704c2f296..028406085fd7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -23,6 +23,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
 MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-lower-module-vgprs", AMDGPULowerModuleVGPRsPass())
 MODULE_PASS("amdgpu-perf-hint",
             AMDGPUPerfHintAnalysisPass(
               *static_cast<const GCNTargetMachine *>(this)))
@@ -67,7 +68,6 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
 FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
 FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
               AMDGPUPromoteAllocaToVectorPass(*this))
-FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
 FUNCTION_PASS("amdgpu-promote-kernel-arguments",
               AMDGPUPromoteKernelArgumentsPass())
 FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index d8ff923619193..7dbc3e4f79690 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -8,29 +8,33 @@
 //
 /// \file
 /// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
-/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
-/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
-/// carries a constant byte offset, which selects the dword (subregister) to
-/// copy.
+/// objects (AMDGPUAS::VGPR / addrspace(13)) into register copies to/from a
+/// block of physical VGPRs that backs the per-function VGPR file. Each pseudo
+/// carries a constant dword index into the file.
 ///
-/// This runs once the function is out of SSA form (so the single backing tuple
-/// can be defined by several subregister copies) and while LiveIntervals is
-/// available. The backing tuple has lane-divergent liveness (its subregisters
-/// are written and read independently), which the whole-register LiveVariables
-/// analysis cannot represent; the pass therefore updates the subregister-aware
-/// LiveIntervals directly.
+/// A load is simply a COPY from the file register and a store is a COPY to it;
+/// the storage is persistent for the whole function like LDS. The file occupies
+/// a fixed block of physical VGPRs (SIRegisterInfo::getVGPRMemoryFile) that is
+/// reserved out of allocation (SIRegisterInfo::getReservedRegs) and accounted
+/// for in the VGPR count (AMDGPUResourceUsageAnalysis). It sits at the low end
+/// of the VGPR file, just above the ABI inputs, at a base that
+/// AMDGPULowerModuleVGPRs shares across a call graph (so an address resolves to
+/// the same registers in every function), keeping it low enough that the file
+/// costs only its own size rather than pinning occupancy. This pass runs after
+/// register allocation; until then the access pseudos behave as opaque memory
+/// operations, so register allocation is free to use any other register for the
+/// surrounding code.
 //
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
 
 using namespace llvm;
 
@@ -96,9 +100,6 @@ class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<LiveIntervalsWrapperPass>();
-    AU.addPreserved<LiveIntervalsWrapperPass>();
-    AU.addPreserved<SlotIndexesWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -118,80 +119,49 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  // Collect the pseudos and determine how many dwords the backing tuple needs.
-  // Each pseudo carries a constant dword index and accesses as many dwords as
-  // its data register class is wide.
-  SmallVector<MachineInstr *, 8> Worklist;
-  unsigned NumDwords = 0;
+  // The file occupies a fixed block of physical VGPRs (see
+  // SIRegisterInfo::getVGPRMemoryFile), which is already reserved out of
+  // allocation by getReservedRegs. Because the registers are reserved, they are
+  // exempt from liveness checks and need no explicit definition, and because
+  // the location is fixed (not function-local), references in different
+  // functions of a call graph resolve to the same physical registers.
+  auto [BaseIdx, FileDwords] = TRI->getVGPRMemoryFile(MF);
+  if (FileDwords == 0)
+    return false;
+
+  const TargetRegisterClass &VGPR32 = AMDGPU::VGPR_32RegClass;
+  bool Changed = false;
   for (MachineBasicBlock &MBB : MF) {
-    for (MachineInstr &MI : MBB) {
+    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
       unsigned Opc = MI.getOpcode();
-      if (!isVGPRFrameLoad(Opc) && !isVGPRFrameStore(Opc))
+      bool IsLoad = isVGPRFrameLoad(Opc);
+      if (!IsLoad && !isVGPRFrameStore(Opc))
         continue;
-      unsigned Dword = MI.getOperand(1).getImm();
-      unsigned AccessDwords =
-          TRI->getRegSizeInBits(*MRI.getRegClass(MI.getOperand(0).getReg())) /
-          32;
-      NumDwords = std::max(NumDwords, Dword + AccessDwords);
-      Worklist.push_back(&MI);
-    }
-  }
 
-  if (Worklist.empty())
-    return false;
-
-  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-
-  const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
-  assert(RC && "no VGPR register class for VGPR-as-memory object");
-  Register Storage = MRI.createVirtualRegister(RC);
-
-  // Define the whole tuple up front so partial (subregister) writes and reads
-  // of uninitialized lanes are well formed.
-  MachineBasicBlock &Entry = MF.front();
-  MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
-                                 TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
-  LIS->InsertMachineInstrInMaps(*ImpDef);
-
-  for (MachineInstr *MI : Worklist) {
-    MachineBasicBlock &MBB = *MI->getParent();
-    const DebugLoc &DL = MI->getDebugLoc();
-    unsigned Dword = MI->getOperand(1).getImm();
-    unsigned AccessDwords =
-        TRI->getRegSizeInBits(*MRI.getRegClass(MI->getOperand(0).getReg())) /
-        32;
-    unsigned SubReg =
-        (Dword == 0 && AccessDwords == NumDwords)
-            ? AMDGPU::NoSubRegister
-            : SIRegisterInfo::getSubRegFromChannel(Dword, AccessDwords);
-
-    MachineInstr *Copy;
-    if (isVGPRFrameLoad(MI->getOpcode())) {
-      Register Dst = MI->getOperand(0).getReg();
-      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
-                 .addReg(Storage, {}, SubReg);
-    } else {
-      Register Src = MI->getOperand(0).getReg();
-      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
-                 .addReg(Storage, RegState::Define, SubReg)
-                 .addReg(Src);
+      const DebugLoc &DL = MI.getDebugLoc();
+      unsigned Dword = MI.getOperand(1).getImm();
+      Register Data = MI.getOperand(0).getReg();
+      unsigned AccessDwords = TRI->getRegSizeInBits(Data, MRI) / 32;
+
+      // Physical (sub)register backing this access within the file.
+      MCRegister Phys = VGPR32.getRegister(BaseIdx + Dword);
+      if (AccessDwords != 1) {
+        const TargetRegisterClass *RC =
+            TRI->getVGPRClassForBitWidth(AccessDwords * 32);
+        Phys = TRI->getMatchingSuperReg(Phys, AMDGPU::sub0, RC);
+        assert(Phys &&
+               "no aligned physical VGPR tuple for VGPR-as-memory access");
+      }
+
+      if (IsLoad)
+        BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), Data).addReg(Phys);
+      else
+        BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), Phys).addReg(Data);
+
+      MI.eraseFromParent();
+      Changed = true;
     }
-    // The copy takes the pseudo's slot, so the intervals of the copied
-    // load/store operand stay valid.
-    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
-    MI->eraseFromParent();
   }
 
-  // The backing tuple is brand new; compute its (subregister) live interval.
-  LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
-
-  // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
-  // form disconnected value-number components within the single tuple, which an
-  // individual live interval must not contain. Split them into separate
-  // virtual registers, exactly as the register coalescer does for the intervals
-  // it leaves behind.
-  SmallVector<LiveInterval *, 4> SplitLIs;
-  LIS->splitSeparateComponents(LI, SplitLIs);
-
-  return true;
+  return Changed;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 478f54b7cdfc3..95e06dc8295d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,7 +35,6 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -139,7 +138,6 @@ class AMDGPUPromoteAllocaImpl {
   unsigned MaxVGPRs;
   unsigned VGPRBudgetRatio;
   unsigned MaxVectorRegs;
-  unsigned AllocVGPROffset = 0;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -164,10 +162,6 @@ class AMDGPUPromoteAllocaImpl {
   void analyzePromoteToVector(AllocaAnalysis &AA) const;
   void promoteAllocaToVector(AllocaAnalysis &AA);
   void analyzePromoteToLDS(AllocaAnalysis &AA) const;
-
-  /// Allocate an alloca that already lives in the VGPR address space to a range
-  /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
-  void allocateVgprs(AllocaAnalysis &AA);
   bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                              SetVector<IntrinsicInst *> &DeferredIntrs);
   void
@@ -185,11 +179,7 @@ class AMDGPUPromoteAllocaImpl {
     IsAMDHSA = TT.getOS() == Triple::AMDHSA;
   }
 
-  /// IsLatePass is true when invoked as a codegen pass and false when invoked
-  /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
-  /// requests only the work strictly required for functionality (i.e. VGPR
-  /// allocation), skipping the optimization-oriented promotions.
-  bool run(Function &F, bool IsLatePass, bool NoOpt);
+  bool run(Function &F, bool PromoteToLDS);
 };
 
 // FIXME: This can create globals so should be a module pass.
@@ -197,34 +187,26 @@ class AMDGPUPromoteAlloca : public FunctionPass {
 public:
   static char ID;
 
-  explicit AMDGPUPromoteAlloca(
-      CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
-      : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
+  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
 
   bool runOnFunction(Function &F) override {
     if (skipFunction(F))
       return false;
-    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
       return AMDGPUPromoteAllocaImpl(
                  TPC->getTM<TargetMachine>(),
                  getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
-          .run(F, /*IsLatePass=*/true, NoOpt);
-    }
+          .run(F, /*PromoteToLDS=*/true);
     return false;
   }
 
-  StringRef getPassName() const override {
-    return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
-  }
+  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<LoopInfoWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
   }
-
-private:
-  bool NoOpt;
 };
 
 static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -271,8 +253,7 @@ char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
 PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                                FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
-                                                     /*NoOpt=*/false);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -284,8 +265,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
 PreservedAnalyses
 AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/false,
-                                                     /*NoOpt=*/false);
+  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
   if (Changed) {
     PreservedAnalyses PA;
     PA.preserveSet<CFGAnalyses>();
@@ -294,21 +274,8 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
   return PreservedAnalyses::all();
 }
 
-PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
-                                              FunctionAnalysisManager &AM) {
-  auto &LI = AM.getResult<LoopAnalysis>(F);
-  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
-                                                     /*NoOpt=*/true);
-  if (Changed) {
-    PreservedAnalyses PA;
-    PA.preserveSet<CFGAnalyses>();
-    return PA;
-  }
-  return PreservedAnalyses::all();
-}
-
-FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
-  return new AMDGPUPromoteAlloca(OptLevel);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+  return new AMDGPUPromoteAlloca();
 }
 
 bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -401,110 +368,9 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
     VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
 }
 
-// A "VGPR as memory" object can only be realized in registers today when every
-// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
-// bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses are not yet supported; such objects fall back
-// to scratch instead.
-//
-// TODO: Lower dynamically-indexed / escaping VGPR objects so this fallback is no
-// longer needed.
-static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
-                                            const DataLayout &DL) {
-  // An access is lowerable if it covers a whole number of dwords and starts at
-  // a dword-aligned constant offset from the alloca.
-  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
-    if (!Simple)
-      return false;
-    uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
-    if (Bits == 0 || Bits % 32 != 0)
-      return false;
-    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
-    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
-        DL, Off, /*AllowNonInbounds=*/true);
-    return Base == &AI && Off.urem(4) == 0;
-  };
-
-  SmallVector<const Use *, 16> Worklist;
-  for (const Use &U : AI.uses())
-    Worklist.push_back(&U);
-
-  while (!Worklist.empty()) {
-    const Use *U = Worklist.pop_back_val();
-    const User *Usr = U->getUser();
-
-    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
-      if (!GEP->hasAllConstantIndices())
-        return false;
-      for (const Use &GU : GEP->uses())
-        Worklist.push_back(&GU);
-      continue;
-    }
-    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
-      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
-        return false;
-      continue;
-    }
-    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
-      // The pointer must be the address operand, not a stored value (escape).
-      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
-        return false;
-      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
-                    SI->isSimple()))
-        return false;
-      continue;
-    }
-    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
-    // otherwise not statically lowerable.
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
+  if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
     return false;
-  }
-  return true;
-}
-
-// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
-// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
-// falls back to ordinary scratch.
-static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
-  SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
-  for (Use *U : Uses) {
-    User *Usr = U->getUser();
-    if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
-      IRBuilder<> B(GEP);
-      SmallVector<Value *, 4> Indices(GEP->indices());
-      Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
-                                  GEP->getName(), GEP->getNoWrapFlags());
-      rewriteVGPRPointerToScratch(GEP, NewGEP);
-      GEP->eraseFromParent();
-      continue;
-    }
-    if (auto *II = dyn_cast<IntrinsicInst>(Usr);
-        II && II->isLifetimeStartOrEnd()) {
-      II->eraseFromParent();
-      continue;
-    }
-    // Loads, stores, address-space casts and call arguments only need this
-    // operand repointed; their result types do not depend on the operand's
-    // address space.
-    U->set(New);
-  }
-}
-
-static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
-  auto *NewAI = new AllocaInst(
-      AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
-      AI->getAlign(), AI->getName(), AI->getIterator());
-  NewAI->setDebugLoc(AI->getDebugLoc());
-  rewriteVGPRPointerToScratch(AI, NewAI);
-  AI->eraseFromParent();
-}
-
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
-  assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
-  if (!IsLatePass && DisablePromoteAllocaToVector)
-    return false;
-
-  bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
-  bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
 
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -513,12 +379,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
   setFunctionLimits(F);
 
-  // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
-  // GFX12 and later. On any other target the objects fall back to scratch.
-  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
-  const bool TargetSupportsVGPRAsMemory =
-      ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
-
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
@@ -535,18 +395,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
       LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
 
       AllocaAnalysis AA{AI};
-      if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
-        // Allocas that already live in the VGPR address space only need to be
-        // assigned VGPRs, which is required for functionality.
-        if (IsLatePass)
-          Allocas.push_back(std::move(AA));
-        continue;
-      }
-      if (!PromoteToVector && !PromoteToLDS)
-        continue;
       if (collectAllocaUses(AA)) {
-        if (PromoteToVector)
-          analyzePromoteToVector(AA);
+        analyzePromoteToVector(AA);
         if (PromoteToLDS)
           analyzePromoteToLDS(AA);
         if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -557,15 +407,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
     }
   }
 
-  stable_sort(Allocas, [](const auto &A, const auto &B) {
-    // Prioritize pre-existing VGPR allocas, since their allocation must not
-    // fail.
-    bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
-    bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
-    if (AIsVGPR != BIsVGPR)
-      return AIsVGPR;
-    return A.Score > B.Score;
-  });
+  stable_sort(Allocas,
+              [](const auto &A, const auto &B) { return A.Score > B.Score; });
 
   // clang-format off
   LLVM_DEBUG(
@@ -578,39 +421,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   bool Changed = false;
   SetVector<IntrinsicInst *> DeferredIntrs;
   for (AllocaAnalysis &AA : Allocas) {
-    if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
-      // Fall back to scratch (and warn) when the object can't be kept in
-      // registers, so the program still compiles correctly: either the target
-      // does not support "VGPR as memory", or the access pattern (dynamic
-      // index, sub-dword, escaping address) is not yet supported.
-      const char *Unsupported = nullptr;
-      if (!TargetSupportsVGPRAsMemory)
-        Unsupported = "not supported on this target";
-      else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
-        Unsupported = "dynamic indexing, sub-dword access, or escaping address "
-                      "is not yet supported";
-      if (Unsupported) {
-        F.getContext().diagnose(DiagnosticInfoUnsupported(
-            F,
-            Twine("'amdgpu_vgpr' object could not be kept in vector registers "
-                  "(") +
-                Unsupported + "); using scratch memory instead",
-            AA.Alloca->getDebugLoc(), DS_Warning));
-        demoteVGPRAllocaToScratch(AA.Alloca);
-        Changed = true;
-        continue;
-      }
-      const unsigned AllocaCost =
-          AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
-      allocateVgprs(AA);
-      // Account for the consumed VGPRs in the vectorization budget.
-      if (VectorizationBudget > AllocaCost)
-        VectorizationBudget -= AllocaCost;
-      else
-        VectorizationBudget = 0;
-      Changed = true;
-      continue;
-    }
     if (AA.Vector.Ty) {
       std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
       assert(Size); // Expected to succeed on non-array alloca.
@@ -645,21 +455,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
   return Changed;
 }
 
-void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
-  LLVMContext &Ctx = Mod->getContext();
-  const unsigned AllocaSize =
-      DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()) / 8;
-
-  // Record where the object was allocated within the VGPR file.
-  Type *I32 = Type::getInt32Ty(Ctx);
-  AA.Alloca->setMetadata(
-      "amdgpu.allocated.vgprs",
-      MDNode::get(
-          Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
-                ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
-  AllocVGPROffset += alignTo(AllocaSize, 4);
-}
-
 // Checks if the instruction I is a memset user of the alloca AI that we can
 // deal with. Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index aab43f23cf606..fc97c33a123f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -179,9 +179,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
   // If there are no calls, MachineRegisterInfo can tell us the used register
   // count easily.
   // A tail call isn't considered a call for MachineFrameInfo's purposes.
+  // The "VGPR as memory" file occupies reserved physical VGPRs. They are not
+  // counted as "used" registers, but they must still be allocated for the
+  // function, so the VGPR count has to cover the highest one.
+  std::pair<unsigned, unsigned> VGPRMemFile = TRI.getVGPRMemoryFile(MF);
+  unsigned VGPRMemBase = VGPRMemFile.first;
+  unsigned VGPRMemCount = VGPRMemFile.second;
+  auto AccountForVGPRMemoryFile = [&](int32_t NumVGPR) -> int32_t {
+    if (VGPRMemCount)
+      NumVGPR = std::max<int32_t>(NumVGPR, VGPRMemBase + VGPRMemCount);
+    return NumVGPR;
+  };
+
   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
     Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
                                           /*IncludeCalls=*/false);
+    Info.NumVGPR = AccountForVGPRMemoryFile(Info.NumVGPR);
     return Info;
   }
 
@@ -319,7 +332,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
     }
   }
 
-  Info.NumVGPR = MaxVGPR + 1;
+  Info.NumVGPR = AccountForVGPRMemoryFile(MaxVGPR + 1);
 
   return Info;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5814862a514b9..6a2b8ffa25c50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -669,6 +669,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIFixSGPRCopiesLegacyPass(*PR);
   initializeSIFixVGPRCopiesLegacyPass(*PR);
   initializeAMDGPUPrivateObjectVGPRsPass(*PR);
+  initializeAMDGPULowerModuleVGPRsPass(*PR);
   initializeSIFoldOperandsLegacyPass(*PR);
   initializeSIPeepholeSDWALegacyPass(*PR);
   initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1492,6 +1493,11 @@ void AMDGPUPassConfig::addIRPasses() {
     addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
   }
 
+  // Lay out "VGPR as memory" (addrspace(13)) globals into one shared register
+  // file and record the size/base on the participating functions, so the file
+  // resolves to the same physical registers across a kernel's call graph.
+  addPass(createAMDGPULowerModuleVGPRsPass());
+
   // Run atomic optimizer before Atomic Expand
   if ((TM.getTargetTriple().isAMDGCN()) &&
       (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
@@ -1501,12 +1507,9 @@ void AMDGPUPassConfig::addIRPasses() {
 
   addPass(createAtomicExpandLegacyPass());
 
-  // With optimizations enabled, do the full promotion of allocas. Without
-  // optimizations, this only allocates pre-existing VGPR address space allocas,
-  // which is required for functionality.
-  addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
-
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    addPass(createAMDGPUPromoteAlloca());
+
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses();
 
@@ -1721,11 +1724,6 @@ void GCNPassConfig::addFastRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
-  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
-  // At O0 there is no register coalescer; anchor on TwoAddress, where
-  // LiveIntervals is already available.
-  insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
-
   insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
 
   TargetPassConfig::addFastRegAlloc();
@@ -1752,12 +1750,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
 
-  // Lower "VGPR as memory" accesses to register copies once out of SSA form.
-  // This runs after the coalescer so it does not perturb the kill flags that
-  // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
-  // the LiveIntervals the register allocator consumes next.
-  insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
-
   if (EnableRewritePartialRegUses)
     insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
 
@@ -1909,6 +1901,12 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
 }
 
 void GCNPassConfig::addPostRegAlloc() {
+  // Lower "VGPR as memory" accesses into copies to/from a reserved block of
+  // VGPRs placed just above the registers allocated for the rest of the
+  // function. This runs after register allocation so the used-register count is
+  // final, and before memory-aware post-RA passes so the access pseudos are no
+  // longer seen as memory operations.
+  addPass(&AMDGPUPrivateObjectVGPRsID);
   addPass(&SIFixVGPRCopiesID);
   if (getOptLevel() > CodeGenOptLevel::None)
     addPass(&SIOptimizeExecMaskingLegacyID);
@@ -2290,6 +2288,10 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
   if (EnableLowerModuleLDS)
     addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
 
+  // Lay out "VGPR as memory" (addrspace(13)) globals into a shared register
+  // file (see the legacy pipeline above for details).
+  addModulePass(AMDGPULowerModuleVGPRsPass(), PMW);
+
   // Run atomic optimizer before Atomic Expand
   if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
       (AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
@@ -2298,15 +2300,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
 
   addFunctionPass(AtomicExpandPass(TM), PMW);
 
-  // With optimizations enabled, do the full promotion of allocas. Without
-  // optimizations, only allocate pre-existing VGPR address space allocas, which
-  // is required for functionality.
-  if (TM.getOptLevel() > CodeGenOptLevel::None)
-    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
-  else
-    addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
-
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
+    addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
     if (isPassEnabled(EnableScalarIRPasses))
       addStraightLineScalarOptimizationPasses(PMW);
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd25ab71997d7..3ca9f5bcc9f9d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
   AMDGPULowerModuleLDSPass.cpp
+  AMDGPULowerModuleVGPRs.cpp
   AMDGPUPrepareAGPRAlloc.cpp
   AMDGPULowerExecSync.cpp
   AMDGPUSwLowerLDS.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index db2301ba28359..86b8c2710e4bd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4227,6 +4227,22 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   return true;
 }
 
+bool SITargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+  // GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses, so
+  // fall back to SelectionDAG (which does) for any instruction that produces or
+  // consumes such a pointer. TODO: implement the GlobalISel path.
+  auto IsVGPRPtr = [](const Value *V) {
+    // Also match vectors of pointers (e.g. the result of a vector GEP); a bare
+    // isPointerTy() test would let those through to GlobalISel.
+    // getPointerAddressSpace() looks through the vector to the element type.
+    Type *Ty = V->getType();
+    return Ty->isPtrOrPtrVectorTy() &&
+           Ty->getPointerAddressSpace() == AMDGPUAS::VGPR;
+  };
+  // The instruction's own type covers producers; its operands cover consumers.
+  return IsVGPRPtr(&Inst) || any_of(Inst.operands(), IsVGPRPtr);
+}
+
 namespace {
 // Chain calls have special arguments that we need to handle. These are
 // tagging along at the end of the arguments list(s), after the SGPR and VGPR
@@ -5228,11 +5244,16 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
       MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
   Register CondReg = MRI.createVirtualRegister(BoolRC);
 
-  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
-      .addReg(InitReg)
-      .addMBB(&OrigBB)
-      .addReg(ResultReg)
-      .addMBB(&LoopBB);
+  // A zero PhiReg means the caller threads no per-iteration result value
+  // through the loop (e.g. a store whose destination is a fixed physical
+  // register), so the result PHI - and its requirement that ResultReg be
+  // live-out of the loop - is omitted.
+  if (PhiReg)
+    BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+        .addReg(InitReg)
+        .addMBB(&OrigBB)
+        .addReg(ResultReg)
+        .addMBB(&LoopBB);
 
   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
       .addReg(InitSaveExecReg)
@@ -5594,6 +5615,153 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   return LoopBB;
 }
 
+// Expand a "VGPR as memory" access at a runtime dword index into an indirect
+// read/write of the reserved VGPR file, reusing the same movrel / s_set_gpr_idx
+// machinery (and waterfall loop for a divergent index) as indirect vector
+// element access. The file's physical register block stands in for the
+// "vector". Erases \p MI and returns the block holding the emitted access:
+// \p MBB for a uniform index, the waterfall loop block for a divergent one.
+static MachineBasicBlock *emitVGPRFrameDynamic(MachineInstr &MI,
+                                               MachineBasicBlock &MBB,
+                                               const GCNSubtarget &ST) {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  MachineFunction *MF = MBB.getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const DebugLoc &DL = MI.getDebugLoc();
+  const bool IsLoad = MI.getOpcode() == AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32;
+
+  auto [BaseIdx, Count] = TRI.getVGPRMemoryFile(*MF);
+  assert(Count && "dynamic VGPR-memory access without a reserved file");
+  const TargetRegisterClass *VecRC = TRI.getVGPRClassForBitWidth(Count * 32);
+  if (!VecRC)
+    report_fatal_error("VGPR-as-memory file too large for a dynamic index");
+  const unsigned VecBits = TRI.getRegSizeInBits(*VecRC);
+  // movrel reads name the base sub-register directly (a subregister index is
+  // not allowed on a physical-register operand), with the whole file tuple as
+  // an implicit use.
+  MCRegister FileBaseReg = AMDGPU::VGPR_32RegClass.getRegister(BaseIdx);
+  // getMatchingSuperReg only searches proper super-registers, so a one-dword
+  // file is its own tuple; any other failed lookup is diagnosed, not ignored.
+  MCRegister FileReg = FileBaseReg;
+  if (Count != 1) {
+    FileReg = TRI.getMatchingSuperReg(FileBaseReg, AMDGPU::sub0, VecRC);
+    if (!FileReg.isValid())
+      report_fatal_error("no physical VGPR tuple for VGPR-as-memory file");
+  }
+
+  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
+
+  // The index is file-relative (the constant part was folded in during ISel),
+  // so the sub-register base is sub0 and the extra offset is zero.
+  unsigned SubReg = AMDGPU::sub0;
+  int Offset = 0;
+
+  MachineBasicBlock::iterator I(&MI);
+
+  // Uniform (scalar) index: emit the access directly.
+  if (TRI.isSGPRClass(IdxRC)) {
+    if (IsLoad) {
+      Register Dst = MI.getOperand(0).getReg();
+      if (UseGPRIdxMode) {
+        Register IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+        BuildMI(MBB, I, DL, TII->getIndirectGPRIDXPseudo(VecBits, true), Dst)
+            .addReg(FileReg)
+            .addReg(IdxReg)
+            .addImm(SubReg);
+      } else {
+        setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+        BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+            .addReg(FileBaseReg)
+            .addReg(FileReg, RegState::Implicit);
+      }
+    } else {
+      const MachineOperand *Val =
+          TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+      if (UseGPRIdxMode) {
+        Register IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+        BuildMI(MBB, I, DL, TII->getIndirectGPRIDXPseudo(VecBits, false),
+                FileReg)
+            .addReg(FileReg)
+            .add(*Val)
+            .addReg(IdxReg)
+            .addImm(SubReg);
+      } else {
+        setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+        BuildMI(MBB, I, DL,
+                TII->getIndirectRegWriteMovRelPseudo(VecBits, 32, false),
+                FileReg)
+            .addReg(FileReg)
+            .add(*Val)
+            .addImm(SubReg);
+      }
+    }
+    MI.eraseFromParent();
+    return &MBB;
+  }
+
+  // Divergent (per-lane) index: a waterfall loop activates the lanes that share
+  // the just-read index, performs the access for them against the file, and
+  // repeats until every lane is covered. The file lives in fixed (reserved)
+  // physical registers, so unlike indirect vector access it is not threaded
+  // through a PHI; the per-lane access reads/writes it in place under EXEC.
+  // The stored value is re-used on every loop iteration, so it must stay live
+  // across the back-edge.
+  if (!IsLoad)
+    MRI.clearKillFlags(
+        TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg());
+
+  // A load threads the loaded element as the loop result; a store writes the
+  // file in place and threads nothing (PhiReg == 0 skips the result PHI).
+  Register PhiReg, InitReg;
+  if (IsLoad) {
+    PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+  }
+
+  Register SGPRIdxReg;
+  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+                              UseGPRIdxMode, SGPRIdxReg);
+  MachineBasicBlock *LoopBB = InsPt->getParent();
+
+  if (IsLoad) {
+    Register Dst = MI.getOperand(0).getReg();
+    if (UseGPRIdxMode) {
+      BuildMI(*LoopBB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, true),
+              Dst)
+          .addReg(FileReg)
+          .addReg(SGPRIdxReg)
+          .addImm(SubReg);
+    } else {
+      BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+          .addReg(FileBaseReg)
+          .addReg(FileReg, RegState::Implicit);
+    }
+  } else {
+    const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    if (UseGPRIdxMode) {
+      BuildMI(*LoopBB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, false),
+              FileReg)
+          .addReg(FileReg)
+          .add(*Val)
+          .addReg(SGPRIdxReg)
+          .addImm(SubReg);
+    } else {
+      BuildMI(*LoopBB, InsPt, DL,
+              TII->getIndirectRegWriteMovRelPseudo(VecBits, 32, false), FileReg)
+          .addReg(FileReg)
+          .add(*Val)
+          .addImm(SubReg);
+    }
+  }
+
+  MI.eraseFromParent();
+  return LoopBB;
+}
+
 static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
                                                       MachineBasicBlock *BB) {
   // For targets older than GFX12, we emit a sequence of 32-bit operations.
@@ -6457,7 +6625,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
           else
             ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
         }
-        ClampInstr.addReg(Src0); // src0
+        ClampInstr.addReg(Src0);              // src0
         if (isFPOp)
           ClampInstr.addImm(SISrcMods::NONE); // src1 mod
         ClampInstr.addReg(Src1);              // src1
@@ -7121,6 +7289,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case AMDGPU::SI_INDIRECT_DST_V16:
   case AMDGPU::SI_INDIRECT_DST_V32:
     return emitIndirectDst(MI, *BB, *getSubtarget());
+  case AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32:
+  case AMDGPU::SI_VGPR_FRAME_DYN_STORE_B32:
+    return emitVGPRFrameDynamic(MI, *BB, *getSubtarget());
   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
   case AMDGPU::SI_KILL_I1_PSEUDO:
     return splitKillBlock(MI, BB);
@@ -9931,6 +10102,23 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
   EVT PtrVT = Op.getValueType();
 
   const GlobalValue *GV = GSD->getGlobal();
+
+  // A "VGPR as memory" (addrspace(13)) global has no numeric memory address;
+  // its "address" is the byte offset of the object within the reserved register
+  // file (assigned by AMDGPULowerModuleVGPRs and recorded as metadata). Lower
+  // it to that constant offset so that, even when the address is materialized
+  // standalone (e.g. from a constant-expression GEP), it never reaches the
+  // pc-relative global-address sequence. SIISelLowering's REG_{LOAD,STORE}
+  // folding then turns the access into a register copy/indexed move.
+  if (GSD->getAddressSpace() == AMDGPUAS::VGPR) {
+    uint64_t Offset = GSD->getOffset();
+    if (const auto *GVar = dyn_cast<GlobalVariable>(GV))
+      if (MDNode *MD = GVar->getMetadata("amdgpu.vgpr.memory.offset"))
+        Offset +=
+            mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
+    return DAG.getConstant(Offset, DL, PtrVT);
+  }
+
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
        shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
@@ -12432,18 +12620,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     case 12:
       if (!Subtarget->hasLDSLoadB96_B128())
         return SDValue();
-      Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
-                                      : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
-            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
-                         : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
       break;
     case 16:
       if (!Subtarget->hasLDSLoadB96_B128())
         return SDValue();
-      Opc = HasVIndex    ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
-                                      : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
-            : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
-                         : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
       break;
     }
 
@@ -12473,11 +12661,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
         Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
             ? 1
             : 0,
-        DL, MVT::i8)); // swz
+        DL, MVT::i8));                                           // swz
     Ops.push_back(
         DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
-    Ops.push_back(M0Val.getValue(0)); // Chain
-    Ops.push_back(M0Val.getValue(1)); // Glue
+    Ops.push_back(M0Val.getValue(0));                            // Chain
+    Ops.push_back(M0Val.getValue(1));                            // Glue
 
     auto *M = cast<MemSDNode>(Op);
     auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
@@ -12555,7 +12743,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       Ops.push_back(VOffset);
     }
 
-    Ops.push_back(Op.getOperand(5)); // Offset
+    Ops.push_back(Op.getOperand(5));  // Offset
 
     unsigned Aux = Op.getConstantOperandVal(6);
     Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
@@ -14330,48 +14518,209 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
 }
 
-/// Lower a load/store of a "VGPR as memory" object (an alloca in
-/// AMDGPUAS::VGPR) into an AMDGPUISD::REG_{LOAD,STORE} node carrying the
-/// constant dword index of the access within the per-function VGPR file. These
-/// nodes are selected into register copies via the SI_VGPR_FRAME_* pseudos and
-/// the AMDGPUPrivateObjectVGPRs pass.
+/// Lower a load/store of a "VGPR as memory" object (a global in AMDGPUAS::VGPR)
+/// into an AMDGPUISD::REG_{LOAD,STORE} node carrying the dword index of the
+/// access within the reserved VGPR file. A constant index selects the
+/// SI_VGPR_FRAME_* pseudos (rewritten to register copies by
+/// AMDGPUPrivateObjectVGPRs); a runtime index selects the SI_VGPR_FRAME_DYN_*
+/// pseudos (expanded to an indexed register move). Sub-dword (i8/i16) loads
+/// extract from, and stores read-modify-write, the containing dword.
 ///
-/// Returns SDValue() if the access cannot (yet) be resolved to a constant file
-/// offset; such objects are demoted to scratch by AMDGPUPromoteAlloca, so any
-/// access that survives to here is expected to fold to a constant offset.
+/// Returns SDValue() for an access this routine does not (yet) handle (e.g. a
+/// wider-than-dword dynamic access), leaving it for the caller.
 SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
                                              SelectionDAG &DAG) const {
   MemSDNode *MemOp = cast<MemSDNode>(Op);
-  const MachineFunction &MF = DAG.getMachineFunction();
   SDLoc DL(Op);
 
-  // Resolve the constant byte offset of the access within the VGPR file
-  // directly from the frame index (plus a constant GEP offset); the frame index
-  // itself is not custom-lowered.
+  // The "VGPR as memory" pointer value is the byte offset of the access within
+  // the reserved register file. After stripping a folded GEP offset, the base
+  // is one of: the addrspace(13) global itself (its offset is recorded as
+  // metadata) before LowerGlobalAddress runs; the constant that
+  // LowerGlobalAddress folds that global to; or a runtime value (dynamic
+  // index).
   SDValue Ptr = MemOp->getBasePtr();
   unsigned ExtraOffset = 0;
+  SDValue DynByteOffset; // non-constant byte offset, for a runtime index
   if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
-    auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1));
-    if (!C)
-      return SDValue();
-    ExtraOffset = C->getZExtValue();
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1)))
+      ExtraOffset = C->getZExtValue();
+    else
+      DynByteOffset = Ptr.getOperand(1);
     Ptr = Ptr.getOperand(0);
   }
-  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
-  if (!FI)
-    return SDValue();
-  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
-  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
-    return SDValue();
-  unsigned ByteOffset =
-      AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
-  if (ByteOffset % 4 != 0)
-    return SDValue();
 
+  unsigned ByteOffset = ExtraOffset;
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
+    if (GA->getAddressSpace() != AMDGPUAS::VGPR)
+      return SDValue();
+    const auto *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+    if (!GV)
+      return SDValue();
+    MDNode *MD = GV->getMetadata("amdgpu.vgpr.memory.offset");
+    if (!MD)
+      return SDValue();
+    ByteOffset +=
+        mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue() +
+        GA->getOffset();
+  } else if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
+    ByteOffset += C->getZExtValue();
+  } else {
+    // The base is itself a runtime byte offset.
+    if (DynByteOffset)
+      return SDValue(); // two independent dynamic terms; unsupported
+    DynByteOffset = Ptr;
+  }
   EVT MemVT = MemOp->getMemoryVT();
   unsigned BitWidth = MemVT.getSizeInBits();
-  // Only whole-dword accesses are kept in registers; sub-dword and
-  // non-dword-multiple objects are demoted to scratch by AMDGPUPromoteAlloca.
+  MachineFunction &MFn = DAG.getMachineFunction();
+  SDValue Chain = MemOp->getChain();
+
+  auto GetDwordMMO = [&](MachineMemOperand::Flags F) {
+    return MFn.getMachineMemOperand(MemOp->getPointerInfo(), F, /*Size=*/4,
+                                    Align(4));
+  };
+
+  // Runtime (non-constant) index. The constant part of the address is folded in
+  // and divided by 4 to give the dword index, which the REG_{LOAD,STORE} node
+  // carries as a register. Sub-dword (8/16-bit) loads extract from, and stores
+  // read-modify-write, the containing dword, with the bit position computed at
+  // runtime (this is race-free because vector registers are per-lane storage).
+  if (DynByteOffset) {
+    if (BitWidth != 8 && BitWidth != 16 && BitWidth != 32)
+      return SDValue();
+    SDValue DynI32 = DAG.getZExtOrTrunc(DynByteOffset, DL, MVT::i32);
+    SDValue Bytes = DAG.getNode(ISD::ADD, DL, MVT::i32, DynI32,
+                                DAG.getConstant(ByteOffset, DL, MVT::i32));
+    SDValue Index = DAG.getNode(ISD::SRL, DL, MVT::i32, Bytes,
+                                DAG.getConstant(2, DL, MVT::i32));
+
+    auto LoadDword = [&]() {
+      SDValue Ld = DAG.getMemIntrinsicNode(
+          AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+          {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+      Chain = Ld.getValue(1);
+      return Ld;
+    };
+
+    if (BitWidth == 8 || BitWidth == 16) {
+      // Bit position of the field within its dword, computed at runtime. An
+      // aligned i8/i16 access never crosses a dword boundary (the file is
+      // dword-aligned and the element-scaled offset keeps the field inside one
+      // dword).
+      SDValue ByteInDword = DAG.getNode(ISD::AND, DL, MVT::i32, Bytes,
+                                        DAG.getConstant(3, DL, MVT::i32));
+      SDValue BitInDword = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteInDword,
+                                       DAG.getConstant(3, DL, MVT::i32));
+      uint32_t LowMask = maskTrailingOnes<uint32_t>(BitWidth);
+      SDValue LowMaskC = DAG.getConstant(LowMask, DL, MVT::i32);
+
+      if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+        SDValue Old = LoadDword();
+        SDValue Val = DAG.getZExtOrTrunc(StoreOp->getValue(), DL, MVT::i32);
+        Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val, LowMaskC);
+        Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val, BitInDword);
+        SDValue MaskShifted =
+            DAG.getNode(ISD::SHL, DL, MVT::i32, LowMaskC, BitInDword);
+        SDValue Cleared = DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+                                      DAG.getNOT(DL, MaskShifted, MVT::i32));
+        SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+        return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                       DAG.getVTList(MVT::Other),
+                                       {Chain, New, Index}, MVT::i32,
+                                       GetDwordMMO(MachineMemOperand::MOStore));
+      }
+
+      auto *LoadOp = cast<LoadSDNode>(MemOp);
+      SDValue Field =
+          DAG.getNode(ISD::SRL, DL, MVT::i32, LoadDword(), BitInDword);
+      EVT ResVT = LoadOp->getValueType(0);
+      if (LoadOp->getExtensionType() == ISD::SEXTLOAD)
+        Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+                            DAG.getValueType(MemVT));
+      else
+        Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field, LowMaskC);
+      SDValue Result = ResVT == MVT::i32
+                           ? Field
+                           : DAG.getNode(ISD::TRUNCATE, DL, ResVT, Field);
+      return DAG.getMergeValues({Result, Chain}, DL);
+    }
+
+    // Whole 32-bit dynamic access.
+    if (ByteOffset % 4 != 0)
+      return SDValue();
+    if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+      SDValue Val = DAG.getBitcast(MVT::i32, StoreOp->getValue());
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Chain, Val, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    if (LoadOp->getExtensionType() != ISD::NON_EXTLOAD)
+      return SDValue();
+    SDValue Ld = LoadDword();
+    EVT ResVT = LoadOp->getValueType(0);
+    SDValue Res = ResVT == MVT::i32 ? Ld : DAG.getBitcast(ResVT, Ld);
+    return DAG.getMergeValues({Res, Chain}, DL);
+  }
+
+  // Sub-dword (8/16-bit) constant-index access. Registers have no sub-dword
+  // addressing, so the field is extracted from (loads) or inserted into (stores
+  // via read-modify-write) the dword that contains it, using shifts and masks.
+  if (BitWidth == 8 || BitWidth == 16) {
+    unsigned BitInDword = (ByteOffset % 4) * 8;
+    if (BitInDword + BitWidth > 32)
+      return SDValue(); // field crosses a dword boundary; unsupported
+    SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+    uint32_t LowMask = maskTrailingOnes<uint32_t>(BitWidth);
+
+    if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+      SDValue Old = DAG.getMemIntrinsicNode(
+          AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+          {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+      Chain = Old.getValue(1);
+      SDValue Val = DAG.getZExtOrTrunc(StoreOp->getValue(), DL, MVT::i32);
+      Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val,
+                        DAG.getConstant(LowMask, DL, MVT::i32));
+      if (BitInDword)
+        Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val,
+                          DAG.getConstant(BitInDword, DL, MVT::i32));
+      SDValue Cleared =
+          DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+                      DAG.getConstant(~(LowMask << BitInDword), DL, MVT::i32));
+      SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Chain, New, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    SDValue Dword = DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+        {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+    Chain = Dword.getValue(1);
+    SDValue Field = Dword;
+    if (BitInDword)
+      Field = DAG.getNode(ISD::SRL, DL, MVT::i32, Field,
+                          DAG.getConstant(BitInDword, DL, MVT::i32));
+    EVT ResVT = LoadOp->getValueType(0);
+    if (LoadOp->getExtensionType() == ISD::SEXTLOAD)
+      Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+                          DAG.getValueType(MemVT));
+    else
+      Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field,
+                          DAG.getConstant(LowMask, DL, MVT::i32));
+    SDValue Result = ResVT == MVT::i32
+                         ? Field
+                         : DAG.getNode(ISD::TRUNCATE, DL, ResVT, Field);
+    return DAG.getMergeValues({Result, Chain}, DL);
+  }
+
+  // Whole-dword accesses.
+  if (ByteOffset % 4 != 0)
+    return SDValue();
   if (BitWidth == 0 || BitWidth % 32 != 0)
     return SDValue();
   if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
@@ -14397,7 +14746,6 @@ SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
   }
 
   SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
-  SDValue Chain = MemOp->getChain();
   if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
     SDValue Value = StoreOp->getValue();
     if (RegVT != MemVT)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 37f3bb37d1aef..aa1b11e3c4c68 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -34,17 +34,16 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   const GCNSubtarget *Subtarget;
 
 public:
-  MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                    CallingConv::ID CC,
                                     EVT VT) const override;
   unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                          CallingConv::ID CC,
                                          EVT VT) const override;
 
-  unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
-                                                CallingConv::ID CC, EVT VT,
-                                                EVT &IntermediateVT,
-                                                unsigned &NumIntermediates,
-                                                MVT &RegisterVT) const override;
+  unsigned getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const override;
 
   MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
 
@@ -73,7 +72,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
       AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
       AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
       AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
-  SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI,
+  SDValue getPreloadedValue(SelectionDAG &DAG,
+                            const SIMachineFunctionInfo &MFI,
                             EVT VT,
                             AMDGPUFunctionArgInfo::PreloadedValue) const;
 
@@ -81,8 +81,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                              SelectionDAG &DAG) const override;
   SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT,
-                                 unsigned Offset) const;
+  SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+                                 MVT VT, unsigned Offset) const;
   SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
                      SelectionDAG &DAG, bool WithChain) const;
   SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
@@ -134,8 +134,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
-  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG,
-                              ArrayRef<SDValue> Ops,
+  SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+                              SelectionDAG &DAG, ArrayRef<SDValue> Ops,
                               bool IsIntrinsic = false) const;
 
   SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
@@ -152,12 +152,14 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   /// Converts \p Op, which must be of floating point type, to the
   /// floating point type \p VT, by either extending or truncating it.
-  SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
+  SDValue getFPExtOrFPRound(SelectionDAG &DAG,
+                            SDValue Op,
+                            const SDLoc &DL,
                             EVT VT) const;
 
-  SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
-                         SDValue Val, bool Signed,
-                         const ISD::InputArg *Arg = nullptr) const;
+  SDValue convertArgType(
+    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+    bool Signed, const ISD::InputArg *Arg = nullptr) const;
 
   /// Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
@@ -193,10 +195,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
 
-  SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performUCharToFloatCombine(SDNode *N,
+                                     DAGCombinerInfo &DCI) const;
   SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT,
+  SDValue performSHLPtrCombine(SDNode *N,
+                               unsigned AS,
+                               EVT MemVT,
                                DAGCombinerInfo &DCI) const;
 
   SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
@@ -230,8 +235,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
-  unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0,
-                          const SDNode *N1) const;
+  unsigned getFusedOpcode(const SelectionDAG &DAG,
+                          const SDNode *N0, const SDNode *N1) const;
   SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
                                           DAGCombinerInfo &DCI) const;
@@ -394,7 +399,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   getPreferredVectorAction(MVT VT) const override;
 
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                         Type *Ty) const override;
+                                        Type *Ty) const override;
 
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
@@ -413,8 +418,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool supportSplitCSR(MachineFunction *MF) const override;
   void initializeSplitCSR(MachineBasicBlock *Entry) const override;
   void insertCopiesSplitCSR(
-      MachineBasicBlock *Entry,
-      const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+    MachineBasicBlock *Entry,
+    const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
@@ -422,8 +427,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
-  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
-                      bool isVarArg,
+  bool CanLowerReturn(CallingConv::ID CallConv,
+                      MachineFunction &MF, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       LLVMContext &Context, const Type *RetTy) const override;
 
@@ -432,11 +437,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
 
-  void
-  passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo,
-                    const SIMachineFunctionInfo &Info,
-                    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
-                    SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const;
+  void passSpecialInputs(
+    CallLoweringInfo &CLI,
+    CCState &CCInfo,
+    const SIMachineFunctionInfo &Info,
+    SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+    SmallVectorImpl<SDValue> &MemOpChains,
+    SDValue Chain) const;
 
   SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
                           CallingConv::ID CallConv, bool isVarArg,
@@ -447,11 +454,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool mayBeEmittedAsTailCall(const CallInst *) const override;
 
+  bool fallBackToDAGISel(const Instruction &Inst) const override;
+
   bool isEligibleForTailCallOptimization(
-      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-      const SmallVectorImpl<ISD::OutputArg> &Outs,
-      const SmallVectorImpl<SDValue> &OutVals,
-      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
 
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
@@ -467,7 +476,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
-  Register getRegisterByName(const char *RegName, LLT VT,
+  Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
 
   MachineBasicBlock *splitKillBlock(MachineInstr &MI,
@@ -524,7 +533,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
   bool checkAsmConstraintVal(SDValue Op, StringRef Constraint,
                              uint64_t Val) const;
-  bool checkAsmConstraintValA(SDValue Op, uint64_t Val,
+  bool checkAsmConstraintValA(SDValue Op,
+                              uint64_t Val,
                               unsigned MaxSize = 64) const;
   SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
                    SDValue V) const;
@@ -535,7 +545,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                      const APInt &DemandedElts,
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
-  void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known,
+  void computeKnownBitsForFrameIndex(int FrameIdx,
+                                     KnownBits &Known,
                                      const MachineFunction &MF) const override;
   void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R,
                                       KnownBits &Known,
@@ -581,7 +592,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   void emitExpandAtomicLoad(LoadInst *LI) const override;
   void emitExpandAtomicStore(StoreInst *SI) const override;
 
-  LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+  LoadInst *
+  lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
 
   const TargetRegisterClass *getRegClassFor(MVT VT,
                                             bool isDivergent) const override;
@@ -591,7 +603,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   unsigned
   getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override;
 
-  void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF,
+  void allocateHSAUserSGPRs(CCState &CCInfo,
+                            MachineFunction &MF,
                             const SIRegisterInfo &TRI,
                             SIMachineFunctionInfo &Info) const;
 
@@ -606,21 +619,28 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                            const SIRegisterInfo &TRI,
                            SIMachineFunctionInfo &Info) const;
 
-  void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
+  void allocateSystemSGPRs(CCState &CCInfo,
+                           MachineFunction &MF,
                            SIMachineFunctionInfo &Info,
-                           CallingConv::ID CallConv, bool IsShader) const;
+                           CallingConv::ID CallConv,
+                           bool IsShader) const;
 
-  void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+  void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
-  void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF,
-                                 const SIRegisterInfo &TRI,
-                                 SIMachineFunctionInfo &Info) const;
-
-  void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+  void allocateSpecialInputSGPRs(
+    CCState &CCInfo,
+    MachineFunction &MF,
+    const SIRegisterInfo &TRI,
+    SIMachineFunctionInfo &Info) const;
+
+  void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                 MachineFunction &MF,
                                  const SIRegisterInfo &TRI,
                                  SIMachineFunctionInfo &Info) const;
-  void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF,
+  void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
+                                      MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 80a42a66b2368..88aa30ff206ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1244,11 +1244,10 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
 } // End Spill = 1, VALU = 1, isConvergent = 1
 
 // "VGPR as memory" pseudo accesses: a load/store of a whole VGPR tuple (one or
-// more dwords) from/to an alloca in the VGPR address space (AMDGPUAS::VGPR), at
-// a constant dword index within the per-function VGPR file. They are selected
-// from AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
-// register copies by the AMDGPUPrivateObjectVGPRs pass before register
-// allocation.
+// more dwords) from/to a global in the VGPR address space (AMDGPUAS::VGPR), at a
+// constant dword index within the reserved VGPR file. They are selected from
+// AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
+// register copies by the AMDGPUPrivateObjectVGPRs pass.
 let hasSideEffects = 0 in {
 foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
               VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
@@ -1266,6 +1265,25 @@ foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
 }
 } // End hasSideEffects = 0
 
+// "VGPR as memory" pseudo accesses at a *runtime* dword index. The index is a
+// VS_32 ($idx) that the custom inserter resolves into an indirect read/write
+// against the reserved VGPR file (movrel / s_set_gpr_idx, with a waterfall loop
+// for a divergent index), so unlike the constant pseudos these are expanded in
+// EmitInstrWithCustomInserter rather than by AMDGPUPrivateObjectVGPRs. Currently
+// only 32-bit accesses are handled dynamically.
+let usesCustomInserter = 1, hasSideEffects = 0, UseNamedOperandTable = 1 in {
+  def SI_VGPR_FRAME_DYN_LOAD_B32 : VPseudoInstSI <
+      (outs VGPR_32:$vdst), (ins VS_32:$idx)> {
+    let mayLoad = 1;
+    let mayStore = 0;
+  }
+  def SI_VGPR_FRAME_DYN_STORE_B32 : VPseudoInstSI <
+      (outs), (ins VGPR_32:$vdata, VS_32:$idx)> {
+    let mayLoad = 0;
+    let mayStore = 1;
+  }
+}
+
 // Select AMDGPUISD::REG_{LOAD,STORE} (with a constant dword index) into the
 // width-matched frame pseudo.
 multiclass VGPRFrameLoadStorePat<ValueType vt> {
@@ -1276,6 +1294,17 @@ multiclass VGPRFrameLoadStorePat<ValueType vt> {
                (store_inst $data, imm:$idx)>;
 }
 
+// Select AMDGPUISD::REG_{LOAD,STORE} with a non-constant dword index into the
+// dynamic pseudo. Lower complexity than the constant patterns above, so a
+// constant index still prefers them.
+multiclass VGPRFrameDynLoadStorePat<ValueType vt> {
+  def : GCNPat<(vt (SIreg_load i32:$idx)), (SI_VGPR_FRAME_DYN_LOAD_B32 $idx)>;
+  def : GCNPat<(SIreg_store vt:$data, i32:$idx),
+               (SI_VGPR_FRAME_DYN_STORE_B32 $data, $idx)>;
+}
+foreach vt = Reg32Types.types in
+defm : VGPRFrameDynLoadStorePat<vt>;
+
 foreach vt = !listconcat(
     Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
     Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 4be4ce28e6de5..46687a6b061a7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -183,6 +183,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
   MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
       "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
 
+  // "VGPR as memory" file layout assigned module-wide by
+  // AMDGPULowerModuleVGPRs. When present, the file size is fixed here (offsets
+  // come from per-global metadata) and the base register is shared across the
+  // call graph.
+  if (F.hasFnAttribute("amdgpu-vgpr-memory-size"))
+    VGPRMemorySize =
+        F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-size", 0);
+  if (F.hasFnAttribute("amdgpu-vgpr-memory-base"))
+    VGPRMemoryBase =
+        F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-base", ~0u);
+
   // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
   // VGPR available at all times. For now, reserve highest available VGPR. After
   // RA, shift it to the lowest available unused VGPR if the one exist.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 1f43505650222..7568608a0b881 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -587,6 +587,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
   // the serialization easier.
   ReservedRegSet WWMReservedRegs;
 
+  // "VGPR as memory" (AMDGPUAS::VGPR / addrspace(13)) file, assigned by
+  // AMDGPULowerModuleVGPRs: VGPRMemorySize (bytes) comes from the
+  // "amdgpu-vgpr-memory-size" attribute (per-global offsets come from metadata)
+  // and VGPRMemoryBase is the shared base register index from
+  // "amdgpu-vgpr-memory-base" (~0u means "no module-assigned base; derive it").
+  // The file is reserved out of allocation for the whole function, like LDS,
+  // and accesses are lowered to register copies / indexed moves.
+  unsigned VGPRMemorySize = 0;
+  unsigned VGPRMemoryBase = ~0u;
+
   bool IsWholeWaveFunction = false;
 
   using PrologEpilogSGPRSpill =
@@ -690,6 +700,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
   const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
   const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
 
+  // "VGPR as memory" (addrspace(13)) file size in bytes (0 if the function has
+  // no such objects) and shared base register index, both assigned module-wide
+  // by AMDGPULowerModuleVGPRs (~0u base means the backend should derive it; see
+  // SIRegisterInfo::getVGPRMemoryFile).
+  unsigned getVGPRMemorySize() const { return VGPRMemorySize; }
+  unsigned getVGPRMemoryBase() const { return VGPRMemoryBase; }
+
   bool isWWMReservedRegister(Register Reg) const {
     return WWMReservedRegs.contains(Reg);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9700720f0373a..0103c2c22e481 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -584,6 +584,51 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
   return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
 }
 
+std::pair<unsigned, unsigned>
+SIRegisterInfo::getVGPRMemoryFile(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned Bytes = MFI->getVGPRMemorySize();
+  if (!Bytes)
+    return {0, 0};
+
+  // Round to an even number of dwords so wide (>=64-bit) tuple accesses start
+  // on an aligned register on targets that require aligned VGPR tuples.
+  unsigned Dwords = alignTo(divideCeil(Bytes, 4u), 2u);
+
+  // Compute the lowest base that clears this function's own ABI input registers
+  // (workitem ID for kernels, argument VGPRs for functions). The file sits at
+  // the low end so the register allocator stacks the rest of the function on
+  // top of it and the file costs only its own size in the VGPR count, rather
+  // than pinning occupancy as a high-end placement would.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned FirstFree = 0;
+  for (const auto &LI : MRI.liveins()) {
+    MCRegister Reg = LI.first;
+    const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+    if (!RC || !isVGPRClass(RC))
+      continue;
+    unsigned End = getHWRegIndex(Reg) + getRegSizeInBits(*RC) / 32u;
+    FirstFree = std::max(FirstFree, End);
+  }
+  unsigned BaseIdx = alignTo(FirstFree, 2u);
+
+  // Prefer the shared base assigned module-wide by AMDGPULowerModuleVGPRs: it
+  // is the same in every function of the call graph (so an address resolves to
+  // the same physical register everywhere) and is chosen to clear every
+  // participating function's inputs, hence it is at or above the local base.
+  unsigned SharedBase = MFI->getVGPRMemoryBase();
+  if (SharedBase != ~0u) {
+    assert(SharedBase >= BaseIdx &&
+           "shared VGPR-memory base overlaps this function's ABI inputs");
+    BaseIdx = SharedBase;
+  }
+
+  assert(BaseIdx + Dwords <=
+             ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) &&
+         "VGPR-as-memory file does not fit");
+  return {BaseIdx, Dwords};
+}
+
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::MODE);
@@ -747,6 +792,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   for (Register Reg : MFI->getWWMReservedRegs())
     reserveRegisterTuples(Reserved, Reg);
 
+  // Reserve the registers backing "VGPR as memory" (addrspace(13)) objects.
+  // The file occupies a fixed block of physical VGPRs (at the low end, just
+  // above the function's ABI inputs; see getVGPRMemoryFile) and must stay out
+  // of register allocation for the whole function.
+  auto [VGPRMemBase, VGPRMemCount] = getVGPRMemoryFile(MF);
+  for (unsigned I = 0; I != VGPRMemCount; ++I)
+    reserveRegisterTuples(Reserved,
+                          AMDGPU::VGPR_32RegClass.getRegister(VGPRMemBase + I));
+
   // FIXME: Stop using reserved registers for this.
   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
     reserveRegisterTuples(Reserved, Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..afe0225bc0c0c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -96,6 +96,15 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
   bool isAsmClobberable(const MachineFunction &MF,
                         MCRegister PhysReg) const override;
 
+  /// The per-function "VGPR as memory" (addrspace(13)) register file is a fixed
+  /// block of physical VGPRs reserved for the whole function (and, like LDS,
+  /// placed at a location that is consistent across the call graph). Returns
+  /// the VGPR_32 register index of the first file register and the number of
+  /// dword registers it occupies, or {0, 0} if the function has no such
+  /// objects.
+  std::pair<unsigned, unsigned>
+  getVGPRMemoryFile(const MachineFunction &MF) const;
+
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
   const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
   const uint32_t *getCallPreservedMask(const MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7528cd2a009a3..96571dd028b14 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,7 +18,6 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/IR/LLVMContext.h"
@@ -1780,17 +1779,6 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
   return false;
 }
 
-AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
-  const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
-  assert(MD && MD->getNumOperands() == 2 &&
-         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
-  unsigned Address =
-      mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
-  unsigned Size =
-      mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
-  return {Address, Size};
-}
-
 unsigned getVmcntBitMask(const IsaVersion &Version) {
   return (1 << (getVmcntBitWidthLo(Version.Major) +
                 getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b34dde7cb2cd7..1623dc72d2810 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -30,7 +30,6 @@ struct amd_kernel_code_t;
 namespace llvm {
 
 struct Align;
-class AllocaInst;
 class Argument;
 class Function;
 class GlobalValue;
@@ -1033,16 +1032,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
 /// Checks if \p Val is inside \p MD, a !range-like metadata.
 bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
 
-/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
-/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
-/// to within the VGPR file, and its size in bytes.
-struct AllocatedVGPRsMetadata {
-  unsigned Address;
-  unsigned Size;
-
-  static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
-};
-
 // The following methods are only meaningful on targets that support
 // S_WAITCNT.
 
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..67365cdc38b88 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -273,8 +273,10 @@ static std::string computeAMDDataLayout(const Triple &TT) {
   // (address space 7), and 128-bit non-integral buffer resourcees (address
   // space 8) which cannot be non-trivilally accessed by LLVM memory operations
   // like getelementptr.
+  // Address space 13 ("VGPR as memory") uses 32-bit register-relative indices.
   return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
-         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
+         "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:"
+         "64-"
          "v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
          "v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
 }
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
deleted file mode 100644
index f6c64c5121867..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
-
-define void @vgpr_alloca() {
-; CHECK-LABEL: define void @vgpr_alloca(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  ret void
-}
-
-define void @vgpr_alloca_multiple() {
-; CHECK-LABEL: define void @vgpr_alloca_multiple(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
-; CHECK-NEXT:    [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT:    store i32 0, ptr addrspace(13) [[B]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca i32, align 4, addrspace(13)
-  %b = alloca [2 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  store i32 0, ptr addrspace(13) %b
-  ret void
-}
-
-define void @private_alloca_unchanged() {
-; CHECK-LABEL: define void @private_alloca_unchanged(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT:    store i64 42, ptr addrspace(5) [[A]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i64], align 4, addrspace(5)
-  store i64 42, ptr addrspace(5) %a
-  ret void
-}
-
-declare void @use(ptr)
-
-; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
-; back to ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
-; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
-; CHECK-NEXT:    store i32 [[V]], ptr addrspace(5) [[P2]], align 4
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
-  store i32 %v, ptr addrspace(13) %p
-  ret void
-}
-
-; A VGPR object whose address escapes (here via a cast to a generic pointer, as
-; the frontend emits) cannot be kept in registers yet, so it falls back to
-; ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_escaping() {
-; CHECK-LABEL: define void @vgpr_alloca_escaping(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT:    [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
-; CHECK-NEXT:    call void @use(ptr [[CAST]])
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %cast = addrspacecast ptr addrspace(13) %a to ptr
-  call void @use(ptr %cast)
-  ret void
-}
-
-; Whole-dword-multiple accesses (here i64) stay in VGPRs.
-define void @vgpr_alloca_i64(i64 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_i64(
-; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
-; CHECK-NEXT:    store i64 [[V]], ptr addrspace(13) [[A]], align 8
-; CHECK-NEXT:    ret void
-;
-  %a = alloca i64, align 8, addrspace(13)
-  store i64 %v, ptr addrspace(13) %a
-  ret void
-}
-
-; Sub-dword accesses are not supported yet, so the object falls back to scratch.
-define void @vgpr_alloca_subdword(i16 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_subdword(
-; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
-; CHECK-NEXT:    store i16 [[V]], ptr addrspace(5) [[A1]], align 2
-; CHECK-NEXT:    ret void
-;
-  %a = alloca [2 x i16], align 4, addrspace(13)
-  store i16 %v, ptr addrspace(13) %a
-  ret void
-}
-;.
-; CHECK: [[META0]] = !{i32 0, i32 16}
-; CHECK: [[META1]] = !{i32 0, i32 4}
-; CHECK: [[META2]] = !{i32 4, i32 8}
-; CHECK: [[META3]] = !{i32 0, i32 8}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
deleted file mode 100644
index 0a78d119ded18..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and GFX12 and later. On a supported target the object is kept in addrspace(13)
-; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
-; scratch.
-
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a  -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-
-define void @vgpr_obj() {
-; SUPP:   alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
-; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  store i32 0, ptr addrspace(13) %a
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
deleted file mode 100644
index ea914907a900d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
-
-; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
-; indices must lower to register copies, never to scratch/buffer memory traffic.
-
-; CHECK-LABEL: store_load_i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_i32(i32 %v) {
-  %a = alloca i32, align 4, addrspace(13)
-  store i32 %v, ptr addrspace(13) %a
-  %l = load i32, ptr addrspace(13) %a
-  %r = add i32 %l, 1
-  ret i32 %r
-}
-
-; CHECK-LABEL: store_load_array:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_array(i32 %v) {
-  %a = alloca [4 x i32], align 4, addrspace(13)
-  %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
-  %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
-  store i32 %v, ptr addrspace(13) %p1
-  store i32 7, ptr addrspace(13) %p3
-  %l1 = load i32, ptr addrspace(13) %p1
-  %l3 = load i32, ptr addrspace(13) %p3
-  %s = add i32 %l1, %l3
-  ret i32 %s
-}
-
-; A 64-bit (two-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_i64:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i64 @store_load_i64(i64 %v) {
-  %a = alloca i64, align 8, addrspace(13)
-  store i64 %v, ptr addrspace(13) %a
-  %l = load i64, ptr addrspace(13) %a
-  %r = add i64 %l, 1
-  ret i64 %r
-}
-
-; A vector (four-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_v4i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
-  %a = alloca <4 x i32>, align 16, addrspace(13)
-  store <4 x i32> %v, ptr addrspace(13) %a
-  %l = load <4 x i32>, ptr addrspace(13) %a
-  ret <4 x i32> %l
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 94173fb7b11d2..2ddb7abc42ad4 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -27,9 +27,9 @@
 ; GCN-O0-NEXT: amdgpu-lower-exec-sync
 ; GCN-O0-NEXT: amdgpu-sw-lower-lds
 ; GCN-O0-NEXT: amdgpu-lower-module-lds
+; GCN-O0-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O0-NEXT: function
 ; GCN-O0-NEXT:   atomic-expand
-; GCN-O0-NEXT:   amdgpu-vgpr-allocate
 ; GCN-O0-NEXT:   verify
 ; GCN-O0-NEXT:   unreachableblockelim
 ; GCN-O0-NEXT:   ee-instrument<post-inline>
@@ -129,6 +129,7 @@
 ; GCN-O2-NEXT: amdgpu-lower-exec-sync
 ; GCN-O2-NEXT: amdgpu-sw-lower-lds
 ; GCN-O2-NEXT: amdgpu-lower-module-lds
+; GCN-O2-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O2-NEXT: function
 ; GCN-O2-NEXT:   amdgpu-atomic-optimizer
 ; GCN-O2-NEXT:   atomic-expand
@@ -315,6 +316,7 @@
 ; GCN-O3-NEXT: amdgpu-lower-exec-sync
 ; GCN-O3-NEXT: amdgpu-sw-lower-lds
 ; GCN-O3-NEXT: amdgpu-lower-module-lds
+; GCN-O3-NEXT: amdgpu-lower-module-vgprs
 ; GCN-O3-NEXT: function
 ; GCN-O3-NEXT:   amdgpu-atomic-optimizer
 ; GCN-O3-NEXT:   atomic-expand
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index aabfadd33e976..960cbb1a0def2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,15 +47,14 @@
 ; GCN-O0-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O0-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O0-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O0-NEXT:    FunctionPass Manager
 ; GCN-O0-NEXT:      Expand Atomic instructions
-; GCN-O0-NEXT:      Dominator Tree Construction
-; GCN-O0-NEXT:      Natural Loop Information
-; GCN-O0-NEXT:      AMDGPU VGPR Allocate
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
 ; GCN-O0-NEXT:      Instrument function entry/exit with calls to e.g. mcount() (post inlining)
 ; GCN-O0-NEXT:      Scalarize Masked Memory Intrinsics
 ; GCN-O0-NEXT:      Expand reduction intrinsics
+; GCN-O0-NEXT:      Dominator Tree Construction
 ; GCN-O0-NEXT:      AMDGPU Lower Kernel Arguments
 ; GCN-O0-NEXT:    Lower buffer fat pointer operations to buffer resources
 ; GCN-O0-NEXT:    AMDGPU lower intrinsics
@@ -117,7 +116,6 @@
 ; GCN-O0-NEXT:        MachineDominator Tree Construction
 ; GCN-O0-NEXT:        Slot index numbering
 ; GCN-O0-NEXT:        Live Interval Analysis
-; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Whole Quad Mode
 ; GCN-O0-NEXT:        AMDGPU Pre-RA Long Branch Reg
 ; GCN-O0-NEXT:        Fast Register Allocator
@@ -132,6 +130,7 @@
 ; GCN-O0-NEXT:        SI Lower WWM Copies
 ; GCN-O0-NEXT:        AMDGPU Reserve WWM Registers
 ; GCN-O0-NEXT:        Fast Register Allocator
+; GCN-O0-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O0-NEXT:        SI Fix VGPR copies
 ; GCN-O0-NEXT:        Remove Redundant DEBUG_VALUE analysis
 ; GCN-O0-NEXT:        Fixup Statepoint Caller Saved
@@ -210,6 +209,7 @@
 ; GCN-O1-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O1-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Dominator Tree Construction
 ; GCN-O1-NEXT:      Cycle Info Analysis
@@ -362,7 +362,6 @@
 ; GCN-O1-NEXT:        Live Interval Analysis
 ; GCN-O1-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-NEXT:        Register Coalescer
-; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-NEXT:        Machine Instruction Scheduler
@@ -402,6 +401,7 @@
 ; GCN-O1-NEXT:        Stack Slot Coloring
 ; GCN-O1-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O1-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-NEXT:        SI Fix VGPR copies
 ; GCN-O1-NEXT:        SI optimize exec mask operations
 ; GCN-O1-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -502,6 +502,7 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O1-OPTS-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O1-OPTS-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O1-OPTS-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Cycle Info Analysis
@@ -680,7 +681,6 @@
 ; GCN-O1-OPTS-NEXT:        Live Interval Analysis
 ; GCN-O1-OPTS-NEXT:        Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT:        Register Coalescer
-; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O1-OPTS-NEXT:        Rewrite Partial Register Uses
 ; GCN-O1-OPTS-NEXT:        Machine Instruction Scheduler
@@ -721,6 +721,7 @@
 ; GCN-O1-OPTS-NEXT:        Stack Slot Coloring
 ; GCN-O1-OPTS-NEXT:        Machine Copy Propagation Pass
 ; GCN-O1-OPTS-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O1-OPTS-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O1-OPTS-NEXT:        SI Fix VGPR copies
 ; GCN-O1-OPTS-NEXT:        SI optimize exec mask operations
 ; GCN-O1-OPTS-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -822,6 +823,7 @@
 ; GCN-O2-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O2-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O2-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O2-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Dominator Tree Construction
 ; GCN-O2-NEXT:      Cycle Info Analysis
@@ -1003,7 +1005,6 @@
 ; GCN-O2-NEXT:        Live Interval Analysis
 ; GCN-O2-NEXT:        Machine Natural Loop Construction
 ; GCN-O2-NEXT:        Register Coalescer
-; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O2-NEXT:        Rewrite Partial Register Uses
 ; GCN-O2-NEXT:        Machine Instruction Scheduler
@@ -1045,6 +1046,7 @@
 ; GCN-O2-NEXT:        Stack Slot Coloring
 ; GCN-O2-NEXT:        Machine Copy Propagation Pass
 ; GCN-O2-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O2-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O2-NEXT:        SI Fix VGPR copies
 ; GCN-O2-NEXT:        SI optimize exec mask operations
 ; GCN-O2-NEXT:        Remove Redundant DEBUG_VALUE analysis
@@ -1146,6 +1148,7 @@
 ; GCN-O3-NEXT:    AMDGPU lowering of execution synchronization
 ; GCN-O3-NEXT:    AMDGPU Software lowering of LDS
 ; GCN-O3-NEXT:    Lower uses of LDS variables from non-kernel functions
+; GCN-O3-NEXT:    AMDGPU Lower Module VGPRs
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Dominator Tree Construction
 ; GCN-O3-NEXT:      Cycle Info Analysis
@@ -1340,7 +1343,6 @@
 ; GCN-O3-NEXT:        Live Interval Analysis
 ; GCN-O3-NEXT:        Machine Natural Loop Construction
 ; GCN-O3-NEXT:        Register Coalescer
-; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        Rename Disconnected Subregister Components
 ; GCN-O3-NEXT:        Rewrite Partial Register Uses
 ; GCN-O3-NEXT:        Machine Instruction Scheduler
@@ -1382,6 +1384,7 @@
 ; GCN-O3-NEXT:        Stack Slot Coloring
 ; GCN-O3-NEXT:        Machine Copy Propagation Pass
 ; GCN-O3-NEXT:        Machine Loop Invariant Code Motion
+; GCN-O3-NEXT:        AMDGPU Private Object VGPRs
 ; GCN-O3-NEXT:        SI Fix VGPR copies
 ; GCN-O3-NEXT:        SI optimize exec mask operations
 ; GCN-O3-NEXT:        Remove Redundant DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 1552014dc24e0..66c618782d955 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -55,7 +55,7 @@
 @nullptr12 = global ptr addrspace(12) addrspacecast (ptr null to ptr addrspace(12))
 
 ; CHECK-LABEL: nullptr13:
-; R600-NEXT: .long 0
+; CHECK-NEXT: .long -1
 @nullptr13 = global ptr addrspace(13) addrspacecast (ptr null to ptr addrspace(13))
 
 ; CHECK-LABEL: nullptr14:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index fc5dabc584863..1a73c35f83f8f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -49,6 +49,7 @@
 ; O0-NEXT: SI Lower WWM Copies
 ; O0-NEXT: AMDGPU Reserve WWM Registers
 ; O0-NEXT: Fast Register Allocator
+; O0-NEXT: AMDGPU Private Object VGPRs
 ; O0-NEXT: SI Fix VGPR copies
 
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
new file mode 100644
index 0000000000000..fb763cd31e339
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O2
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A "VGPR as memory" access through a constant-expression GEP must lower to a
+; register copy, not the pc-relative global-address sequence (which previously
+; crashed because addrspace(13) pointers are 32-bit). Exercised at -O0 too,
+; where the address is materialized standalone rather than folded.
+
+@buf = internal addrspace(13) global [4 x i32] poison
+
+define void @store_constexpr_gep(i32 %v) {
+; O0-LABEL: store_constexpr_gep:
+; O0:       ; %bb.0:
+; O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT:    v_mov_b32_e32 v4, v0
+; O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; O2-LABEL: store_constexpr_gep:
+; O2:       ; %bb.0:
+; O2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT:    v_mov_b32_e32 v4, v0
+; O2-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+  ret void
+}
+
+define i32 @load_constexpr_gep() {
+; O0-LABEL: load_constexpr_gep:
+; O0:       ; %bb.0:
+; O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT:    v_mov_b32_e32 v0, v4
+; O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; O2-LABEL: load_constexpr_gep:
+; O2:       ; %bb.0:
+; O2-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT:    v_mov_b32_e32 v0, v4
+; O2-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+  ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
new file mode 100644
index 0000000000000..67b5d01df95b3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s -o /dev/null
+
+; A runtime (non-constant) index into a "VGPR as memory" object becomes an
+; indexed move into the reserved VGPR file: s_set_gpr_idx on gfx9, movrel on
+; gfx10+, with a waterfall loop for a divergent index.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+@buf8 = internal addrspace(13) global [16 x i8] poison
+@buf16 = internal addrspace(13) global [16 x i16] poison
+
+define amdgpu_kernel void @dyn_uniform(ptr addrspace(1) %out, i32 %i, i32 %v) {
+; GFX942-LABEL: dyn_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_andn2_b32 s2, s2, -2.0
+; GFX942-NEXT:    v_mov_b32_e32 v18, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dword v0, v18, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT:    s_and_b32 m0, s2, 0x3fffffff
+; GFX11-NEXT:    v_movreld_b32_e32 v2, s3
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %i
+  store i32 %v, ptr addrspace(13) %p
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_load(ptr addrspace(1) %out) {
+; GFX942-LABEL: dyn_divergent_load:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v18, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v1, v18, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_load:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s3, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s3
+; GFX11-NEXT:    v_movrels_b32_e32 v18, v2
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v1, v18, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_store(ptr addrspace(1) %out, i32 %v) {
+; GFX942-LABEL: dyn_divergent_store:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s0, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_store:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT:    s_mov_b32 s1, exec_lo
+; GFX11-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s1, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_movreld_b32_e32 v2, s0
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+; Sub-dword (i8/i16) at a runtime index: the containing dword is read-modify-
+; written with the bit position computed at runtime.
+define amdgpu_kernel void @dyn_i8_uniform(ptr addrspace(1) %out, i32 %i, i8 %v) {
+; GFX942-LABEL: dyn_i8_uniform:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s5, s2, 3
+; GFX942-NEXT:    s_and_b32 s4, s3, 0xff
+; GFX942-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX942-NEXT:    s_lshl_b32 s4, s4, s5
+; GFX942-NEXT:    s_lshl_b32 s5, 0xff, s5
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    v_not_b32_e32 v6, s5
+; GFX942-NEXT:    v_and_b32_e32 v1, v1, v6
+; GFX942-NEXT:    v_or_b32_e32 v1, s4, v1
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_i8_uniform:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s4, s2, 3
+; GFX11-NEXT:    s_lshr_b32 m0, s2, 2
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 3
+; GFX11-NEXT:    v_movrels_b32_e32 v0, v2
+; GFX11-NEXT:    s_lshl_b32 s2, 0xff, s4
+; GFX11-NEXT:    v_mov_b32_e32 v6, s3
+; GFX11-NEXT:    v_not_b32_e32 v1, s2
+; GFX11-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_lshl_b32 s2, s2, s4
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, s2, v0
+; GFX11-NEXT:    global_store_b8 v1, v6, s[0:1]
+; GFX11-NEXT:    v_movreld_b32_e32 v2, v0
+; GFX11-NEXT:    s_endpgm
+  %p = getelementptr [16 x i8], ptr addrspace(13) @buf8, i32 0, i32 %i
+  store i8 %v, ptr addrspace(13) %p
+  %l = load i8, ptr addrspace(13) %p
+  store i8 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @dyn_i16_divergent(ptr addrspace(1) %out, i16 %v) {
+; GFX942-LABEL: dyn_i16_divergent:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_and_b32_e32 v10, 0x3ff, v0
+; GFX942-NEXT:    s_mov_b32 s2, 0xffff
+; GFX942-NEXT:    v_lshlrev_b32_e32 v10, 4, v10
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s4, s6, 0xffff
+; GFX942-NEXT:    v_lshlrev_b32_e64 v11, v10, s2
+; GFX942-NEXT:    v_bfe_u32 v0, v0, 1, 9
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s5, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v12, v2
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    v_bfi_b32 v11, v11, 0, v12
+; GFX942-NEXT:    v_lshl_or_b32 v10, s4, v10, v11
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:  .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT:    s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT:    s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX942-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    s_xor_b64 exec, exec, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_3
+; GFX942-NEXT:  ; %bb.4:
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dyn_i16_divergent:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT:    v_bfe_u32 v0, v0, 1, 9
+; GFX11-NEXT:    s_mov_b32 s4, exec_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v10, 4, v1
+; GFX11-NEXT:    v_lshlrev_b32_e64 v11, v10, 0xffff
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s3, s2, 0xffff
+; GFX11-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s5, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s5
+; GFX11-NEXT:    v_movrels_b32_e32 v12, v2
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX11-NEXT:  ; %bb.2:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_bfi_b32 v11, v11, 0, v12
+; GFX11-NEXT:    v_lshl_or_b32 v10, s3, v10, v11
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:  .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT:    s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT:    s_mov_b32 m0, s4
+; GFX11-NEXT:    v_movreld_b32_e32 v2, v10
+; GFX11-NEXT:    s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT:    s_cbranch_execnz .LBB4_3
+; GFX11-NEXT:  ; %bb.4:
+; GFX11-NEXT:    s_mov_b32 exec_lo, s3
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p = getelementptr [16 x i16], ptr addrspace(13) @buf16, i32 0, i32 %tid
+  store i16 %v, ptr addrspace(13) %p
+  %l = load i16, ptr addrspace(13) %p
+  store i16 %l, ptr addrspace(1) %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
new file mode 100644
index 0000000000000..0dc6dbca45480
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+
+; GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses;
+; fallBackToDAGISel makes such functions fall back to SelectionDAG, which lowers
+; them to register copies rather than crashing in reg-bank legalization.
+
+@g = internal addrspace(13) global i32 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
new file mode 100644
index 0000000000000..6da6f49a9e082
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; AMDGPULowerModuleVGPRs lays out addrspace(13) globals into per-call-graph
+; groups: disjoint kernels (@k1/@a, @k2/@b) get independent layouts and bases,
+; while functions that share a global (@writer/@reader/@g, reached from @k3)
+; share one consistent group, so the address resolves to the same registers.
+
+@a = internal addrspace(13) global [4 x i32] poison
+@b = internal addrspace(13) global [8 x i32] poison
+@g = internal addrspace(13) global i32 poison
+
+;.
+; CHECK: @a = internal addrspace(13) global [4 x i32] poison, !amdgpu.vgpr.memory.offset [[META0:![0-9]+]]
+; CHECK: @b = internal addrspace(13) global [8 x i32] poison, !amdgpu.vgpr.memory.offset [[META0]]
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset [[META0]]
+;.
+define amdgpu_kernel void @k1(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k1(
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT:    store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %p = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @k2(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k2(
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT:    store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %p = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+  %l = load i32, ptr addrspace(13) %p
+  store i32 %l, ptr addrspace(1) %out
+  ret void
+}
+
+define void @writer(i32 %v) {
+; CHECK-LABEL: @writer(
+; CHECK-NEXT:    store i32 [[V:%.*]], ptr addrspace(13) @g, align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @reader() {
+; CHECK-LABEL: @reader(
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(13) @g, align 4
+; CHECK-NEXT:    ret i32 [[L]]
+;
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
+
+define amdgpu_kernel void @k3(ptr addrspace(1) %out, i32 %v) {
+; CHECK-LABEL: @k3(
+; CHECK-NEXT:    call void @writer(i32 [[V:%.*]])
+; CHECK-NEXT:    [[R:%.*]] = call i32 @reader()
+; CHECK-NEXT:    store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  call void @writer(i32 %v)
+  %r = call i32 @reader()
+  store i32 %r, ptr addrspace(1) %out
+  ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="16" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="32" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="4" }
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
new file mode 100644
index 0000000000000..44193d15016f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; Sub-dword (i8/i16) "VGPR as memory" accesses at a constant index are realized
+; as a read-modify-write of the containing dword (shifts and masks), since
+; registers have no sub-dword addressing.
+
+@b = internal addrspace(13) global [8 x i8] poison
+@h = internal addrspace(13) global [4 x i16] poison
+
+define void @store_i8(i8 %v) {
+; CHECK-LABEL: store_i8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffff00ff, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+  store i8 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define i8 @load_i8() {
+; CHECK-LABEL: load_i8:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 8, 8
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+  %l = load i8, ptr addrspace(13) %p
+  ret i8 %l
+}
+
+define void @store_i16(i16 %v) {
+; CHECK-LABEL: store_i16:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+  store i16 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define signext i16 @load_i16_sext() {
+; CHECK-LABEL: load_i16_sext:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+  %l = load i16, ptr addrspace(13) %p
+  ret i16 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
new file mode 100644
index 0000000000000..b567867007fb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" (addrspace(13)) accesses at a constant index lower to plain
+; register copies to/from the reserved VGPR file - never to scratch or buffer
+; memory - and writer/reader of the same global resolve to the same register.
+
+@g = internal addrspace(13) global i32 poison
+@arr = internal addrspace(13) global [4 x i32] poison
+@g64 = internal addrspace(13) global i64 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i32 %v, ptr addrspace(13) @g
+  ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i32, ptr addrspace(13) @g
+  ret i32 %l
+}
+
+define void @store_arr(i32 %v) {
+; CHECK-LABEL: store_arr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+  store i32 %v, ptr addrspace(13) %p
+  ret void
+}
+
+define i32 @load_arr() {
+; CHECK-LABEL: load_arr:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, v4
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+  %l = load i32, ptr addrspace(13) %p
+  ret i32 %l
+}
+
+define void @store_i64(i64 %v) {
+; CHECK-LABEL: store_i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i64 %v, ptr addrspace(13) @g64
+  ret void
+}
+
+define i64 @load_i64() {
+; CHECK-LABEL: load_i64:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %l = load i64, ptr addrspace(13) @g64
+  ret i64 %l
+}
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index bd760de79c9d0..3ca15083959ad 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,24 +2,26 @@
 
 target triple = "amdgcn-amd-amdhsa"
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: %alloca.13 = alloca i32, align 4, addrspace(13)
 define void @static_alloca() {
 entry:
   %alloca.0 = alloca i32, align 4
@@ -36,23 +38,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i32(i32 %n) {
 entry:
@@ -69,23 +71,23 @@ entry:
   ret void
 }
 
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
 ; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
 define void @dynamic_alloca_i64(i64 %n) {
 entry:
diff --git a/llvm/test/Verifier/AMDGPU/vgpr-memory.ll b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
new file mode 100644
index 0000000000000..ebc266b6cd276
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
@@ -0,0 +1,33 @@
+; RUN: not llvm-as %s --disable-output 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; A "VGPR as memory" global is register-backed: it has no defined initial
+; contents and no meaningful numeric address. Diagnostics are emitted for
+; instructions before global variables, so the checks are ordered to match.
+
+; CHECK: addrspacecast to or from the VGPR address space (13) is not allowed
+; CHECK-NEXT: %cast.to = addrspacecast ptr addrspace(13) @valid.poison to ptr
+; CHECK: addrspacecast to or from the VGPR address space (13) is not allowed
+; CHECK-NEXT: %cast.from = addrspacecast ptr %p to ptr addrspace(13)
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.init
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.zeroinit
+
+; A poison initializer (or none) is fine.
+@valid.poison = internal addrspace(13) global i32 poison
+@valid.array = internal addrspace(13) global [4 x i32] poison
+
+@bad.init = internal addrspace(13) global i32 7
+@bad.zeroinit = internal addrspace(13) global [2 x i32] zeroinitializer
+
+define ptr @cast_from_vgpr() {
+  %cast.to = addrspacecast ptr addrspace(13) @valid.poison to ptr
+  ret ptr %cast.to
+}
+
+define ptr addrspace(13) @cast_to_vgpr(ptr %p) {
+  %cast.from = addrspacecast ptr %p to ptr addrspace(13)
+  ret ptr addrspace(13) %cast.from
+}
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index a082adbf6565e..0ec3c753c10f1 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -43,14 +43,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
   // and that ANDGCN adds p7 and p8 as well.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
             "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
             "m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   // Check that the old AMDGCN p8:128:128 definition is upgraded
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
             "m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   // but that r600 does not.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
             "m:e-e-p:32:32-G1");
@@ -66,7 +66,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
       "m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
       "64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
       "1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
-      "128:48-p9:192:256:256:32");
+      "128:48-p9:192:256:256:32-p13:32:32");
 
   // Check that SystemZ adds -S64 if needed.
   EXPECT_EQ(UpgradeDataLayoutString(
@@ -158,24 +158,24 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
   EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
             "m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
             "m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
   EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
             "m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
-            "192:256:256:32");
+            "192:256:256:32-p13:32:32");
 
   // Check that AMDGCN targets don't add already declared address space 7.
-  EXPECT_EQ(
-      UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
-      "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
-      "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
-      "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+  EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
+            "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
+  EXPECT_EQ(UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
+            "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
+  EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
+            "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
 
   // Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
   // flag.
@@ -216,9 +216,9 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
 
   // Check that AMDGPU targets add G1 if it's not present.
   EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
-  EXPECT_EQ(
-      UpgradeDataLayoutString("", "amdgcn"),
-      "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+  EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"),
+            "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:"
+            "256:32-p13:32:32");
 
   // Check that SPIR & SPIRV targets add G1 if it's not present.
   EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");