[clang] [llvm] [AMDGPU] Add initial support for VGPR as memory (PR #205435)
Gheorghe-Teodor Bercea via cfe-commits
cfe-commits at lists.llvm.org
Tue Jun 30 15:52:28 PDT 2026
https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/205435
>From b13adc506c96a4686ba827e24310ff273d53d917 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 19 Jun 2026 14:05:08 -0500
Subject: [PATCH 1/2] Add initial support for VGPR as memory
---
clang/include/clang/Basic/Attr.td | 8 +
clang/include/clang/Basic/AttrDocs.td | 20 ++
.../clang/Basic/DiagnosticCommonKinds.td | 5 +
.../clang/Basic/DiagnosticSemaKinds.td | 3 +
clang/include/clang/Sema/SemaAMDGPU.h | 1 +
clang/lib/CodeGen/CGDecl.cpp | 41 +++-
clang/lib/Sema/SemaAMDGPU.cpp | 14 ++
clang/lib/Sema/SemaDeclAttr.cpp | 3 +
.../CodeGenHIP/amdgpu-vgpr-O0-warning.hip | 14 ++
clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip | 19 ++
...a-attribute-supported-attributes-list.test | 1 +
clang/test/SemaCUDA/amdgpu-vgpr.cu | 28 +++
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 4 +
llvm/lib/IR/VerifierAMDGPU.cpp | 6 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 16 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +++++++++++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp | 145 +++++++++++
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 231 ++++++++++++++++--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 28 ++-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 19 ++
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11 +
.../AMDGPU/amdgpu-vgpr-allocate-basic.ll | 109 +++++++++
.../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll | 20 ++
.../CodeGen/AMDGPU/as-vgpr-alloca-static.ll | 58 +++++
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 +-
llvm/test/Verifier/AMDGPU/alloca.ll | 55 +++--
31 files changed, 983 insertions(+), 69 deletions(-)
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
create mode 100644 clang/test/SemaCUDA/amdgpu-vgpr.cu
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 3f57104d474a7..20f42ce4bd8f7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2522,6 +2522,14 @@ def AMDGPUMaxNumWorkGroups : InheritableAttr {
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
}
+// Marks a local variable for allocation in the AMDGPU "VGPR as memory"
+// address space. The attribute takes no arguments.
+// NOTE(review): this is a plain InheritableAttr rather than a
+// TargetSpecificAttr, and CodeGen only checks the optimization level before
+// emitting the addrspace(13) alloca -- confirm that CUDA device compilation
+// for a non-AMDGPU target cannot reach that path.
+def AMDGPUVGPR : InheritableAttr {
+ let Spellings = [Clang<"amdgpu_vgpr">];
+ let Documentation = [AMDGPUVGPRDocs];
+ let Subjects = SubjectList<[LocalVar], ErrorDiag>;
+ // Only meaningful in CUDA/HIP; semantic checks restrict it to kernel locals.
+ let LangOpts = [CUDA];
+}
+
def BPFPreserveAccessIndex : InheritableAttr,
TargetSpecificAttr<TargetBPF> {
let Spellings = [Clang<"preserve_access_index">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index d806adb4be4b8..7dcf35fe3bd83 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3604,6 +3604,26 @@ An error will be given if:
}];
}
+def AMDGPUVGPRDocs : Documentation {
+ let Category = DocCatAMDGPUAttributes;
+ let Content = [{
+This attribute requests that a kernel-local variable be allocated in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
+so that accesses with statically known indices lower to vector register
+copies instead of scratch memory traffic.
+
+Clang supports the ``__attribute__((amdgpu_vgpr))`` or
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
+local variables declared in a ``__global__`` (kernel) function; applying it to
+a variable in a ``__device__`` or host function is an error. Outside HIP/CUDA
+the attribute is ignored with a warning.
+
+Known limitation: the request is only honored with optimizations enabled. At
+``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
+emitted.
+ }];
+}
+
def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
let Content = [{
Clang supports several different calling conventions, depending on the target
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..fe03be43c80c7 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,6 +319,11 @@ def warn_stack_protection_ignore_attribute : Warning<
"'stack_protector_ignore' attribute ignored due to "
"'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
+// Reported from CodeGen (EmitAutoVarAlloca) when a variable carrying the
+// 'amdgpu_vgpr' attribute is emitted with optimization level 0; %0 is the
+// attribute.
+def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
+ "%0 is not guaranteed to keep the variable in vector registers at -O0; "
+ "it may fall back to scratch memory">,
+ InGroup<DiagGroup<"amdgpu-vgpr">>;
+
def warn_slh_does_not_support_asm_goto : Warning<
"speculative load hardening does not protect functions with asm goto">,
InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 7360c9bbab60a..a5e56e94509da 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3711,6 +3711,9 @@ def err_attribute_argument_invalid : Error<
def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
"'amdgpu_flat_work_group_size' attribute must match "
"'reqd_work_group_size' product">;
+// Reported by SemaAMDGPU::handleAMDGPUVGPRAttr when the variable's enclosing
+// declaration context is not a '__global__' function; %0 is the attribute.
+def err_amdgpu_vgpr_not_kernel_local : Error<
+ "%0 attribute can only be applied to local variables in "
+ "'__global__' (kernel) functions">;
def err_attribute_argument_is_zero : Error<
"%0 attribute must be greater than 0">;
def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index a6205534e0de3..9cb74ed74f4b9 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -79,6 +79,7 @@ class SemaAMDGPU : public SemaBase {
void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+ void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
/// Expand a valid use of the feature identification builtins into its
/// corresponding sequence of instructions.
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 7608f8cb6fc7a..bca2d11d47c6a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>
using namespace clang;
@@ -1601,9 +1602,37 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// Create the alloca. Note that we set the name separately from
// building the instruction so that it's there even in no-asserts
// builds.
- address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
- allocaAlignment, D.getName(),
- /*ArraySize=*/nullptr, &AllocaAddr);
+ //
+ // "VGPR as memory" objects keep their backing registers only once the
+ // optimizing register allocator runs. At -O0 the backend cannot lower
+ // these accesses (e.g. when the address escapes a basic block), so the
+ // request is not honored: fall back to an ordinary (scratch) alloca and
+ // warn, matching the documented behavior.
+ // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
+ // backing tuple to scratch) so this fallback can be removed.
+ const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+ const bool UseVGPRMemory =
+ VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
+ if (VGPRAttr && !UseVGPRMemory)
+ CGM.getDiags().Report(D.getLocation(),
+ diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
+ << VGPRAttr;
+
+ if (UseVGPRMemory) {
+ // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
+ // address space so that statically indexed accesses lower to vector
+ // register copies instead of scratch memory.
+ auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
+ /*ArraySize=*/nullptr, D.getName(),
+ AllocaInsertPt->getIterator());
+ AI->setAlignment(allocaAlignment.getAsAlign());
+ AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+ address = AllocaAddr;
+ } else {
+ address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
+ allocaAlignment, D.getName(),
+ /*ArraySize=*/nullptr, &AllocaAddr);
+ }
// Don't emit lifetime markers for MSVC catch parameters. The lifetime of
// the catch parameter starts in the catchpad instruction, and we can't
@@ -1612,8 +1641,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
// Emit a lifetime intrinsic if meaningful. There's no point in doing this
- // if we don't have a valid insertion point (?).
- if (HaveInsertPoint() && !IsMSCatchParam) {
+ // if we don't have a valid insertion point (?). "VGPR as memory" allocas
+ // live in a non-alloca address space, so the standard lifetime markers
+ // (which assume the alloca address space) are skipped for them.
+ if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
// If there's a jump into the lifetime of this variable, its lifetime
// gets broken up into several regions in IR, which requires more work
// to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index bd9e7e7b71ed6..0568ab0b60a07 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Attr.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DynamicRecursiveASTVisitor.h"
#include "clang/AST/Expr.h"
@@ -626,6 +627,19 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
addAMDGPUFlatWorkGroupSizeAttr(D, AL, MinExpr, MaxExpr);
}
+/// Validate and attach the 'amdgpu_vgpr' attribute described by \p AL to \p D.
+///
+/// The attribute's subject list already limits \p D to local variables. This
+/// handler additionally requires the variable's declaration context to be a
+/// function marked '__global__'; anything else (a '__device__' function, a
+/// host function, a non-function context) is diagnosed with
+/// err_amdgpu_vgpr_not_kernel_local and the attribute is not attached.
+void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
+  const auto *EnclosingFn = dyn_cast<FunctionDecl>(D->getDeclContext());
+  const bool InKernel = EnclosingFn && EnclosingFn->hasAttr<CUDAGlobalAttr>();
+
+  if (InKernel) {
+    auto &Ctx = getASTContext();
+    D->addAttr(::new (Ctx) AMDGPUVGPRAttr(Ctx, AL));
+    return;
+  }
+
+  Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+}
+
static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
Expr *MaxExpr,
const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2159c586e5738..095a11acdd02d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7641,6 +7641,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_AMDGPUNumVGPR:
S.AMDGPU().handleAMDGPUNumVGPRAttr(D, AL);
break;
+ case ParsedAttr::AT_AMDGPUVGPR:
+ S.AMDGPU().handleAMDGPUVGPRAttr(D, AL);
+ break;
case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
S.AMDGPU().handleAMDGPUMaxNumWorkGroupsAttr(D, AL);
break;
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
new file mode 100644
index 0000000000000..4d23008b8ef43
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// At -O0 "VGPR as memory" is not honored: the variable falls back to an
+// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
+
+#define __global__ __attribute__((global))
+
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
new file mode 100644
index 0000000000000..9a5c38e48951c
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -emit-llvm -O1 -disable-llvm-passes -o - %s \
+// RUN: | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
+// memory" address space (addrspace(13)), and its accesses stay in that space.
+
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
+// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
+// CHECK: store i32 %{{.*}}, ptr addrspace(13)
+// CHECK: load i32, ptr addrspace(13)
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr));
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 8bca68e2119e7..e79215f090214 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -7,6 +7,7 @@
// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUVGPR (SubjectMatchRule_variable_is_local)
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
// CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
// CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
new file mode 100644
index 0000000000000..6ad3074921b9b
--- /dev/null
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+__global__ void kernel() {
+ int ok[4] __attribute__((amdgpu_vgpr)); // OK
+ (void)ok;
+}
+
+__device__ void device_fn() {
+ int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+ (void)bad;
+}
+
+__host__ void host_fn() {
+ int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+ (void)bad;
+}
+
+// Not a local variable.
+int global_var __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+
+__global__ void takes_no_args() {
+ // Attribute does not accept arguments.
+ int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
+ (void)bad;
+}
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 01b1510524d0f..e9d3add54d054 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -47,6 +47,10 @@ enum : unsigned {
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.
+ VGPR = 13, ///< Address space for "VGPR as memory": objects backed by VGPRs
+ ///< rather than scratch. Shares its numeric value with the
+ ///< graphics-only CONSTANT_BUFFER_5 alias below.
+
RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
/// Internal address spaces. Can be freely renumbered.
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index 04cb214ef2520..de9a0c7bef132 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -122,8 +122,10 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
if (!VS.TT.isAMDGPU())
return;
- if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
- VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+ if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ AI.getAddressSpace() != AMDGPUAS::VGPR)
+ VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
+ &AI);
}
bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index e4367811f1ed4..d19333f14ee63 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca();
+FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
@@ -276,6 +276,20 @@ struct AMDGPUPromoteAllocaPass
TargetMachine &TM;
};
+void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+extern char &AMDGPUPrivateObjectVGPRsID;
+
+// Allocates pre-existing VGPR address space allocas without performing any
+// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
+// objects remain functional.
+struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
+ /// \p TM is held by reference, so it must outlive the pass object.
+ AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
+ /// New-pass-manager entry point; the definition lives out of line.
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ // Target machine captured at construction time.
+ TargetMachine &TM;
+};
+
struct AMDGPUPromoteAllocaToVectorPass
: OptionalPassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7330f3b13f3cb..8e289058a2ed1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -341,25 +343,159 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- if (!Subtarget->d16PreservesUnusedBits())
- return;
+// Resolve the constant byte offset within the per-function VGPR file for a
+// "VGPR as memory" access whose (legalized) address is \p Ptr.
+//
+// \p Ptr is matched either as a bare frame index or as a frame index combined
+// with a constant right-hand operand. Besides ISD::ADD and ISD::PTRADD, an
+// ISD::OR carrying the 'disjoint' flag is accepted: the DAG combiner can
+// rewrite (add FI, C) into (or disjoint FI, C) when the known-zero low bits of
+// an over-aligned frame object cover C, and such an OR is arithmetically an
+// add. Without this, accesses into e.g. a 16-byte-aligned object at offsets
+// 4/8/12 would not be recognized.
+//
+// Returns std::nullopt if \p Ptr is not a constant offset from a
+// VGPR-as-memory frame object.
+static std::optional<unsigned>
+getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
+  unsigned ExtraOffset = 0;
+  const unsigned Opc = Ptr.getOpcode();
+  const bool IsAddLike = Opc == ISD::ADD || Opc == ISD::PTRADD ||
+                         (Opc == ISD::OR && Ptr->getFlags().hasDisjoint());
+  if (IsAddLike) {
+    // Only a constant on the right-hand side is peeled off; any other shape is
+    // left intact and rejected by the frame-index check below.
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
+      // NOTE(review): getZExtValue() is 64-bit; the narrowing to unsigned is
+      // unchanged from the original code.
+      ExtraOffset = C->getZExtValue();
+      Ptr = Ptr.getOperand(0);
+    }
+  }
+  auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+  if (!FI)
+    return std::nullopt;
+  // Only frame objects that originate from an alloca in the VGPR address space
+  // qualify.
+  const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+  if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+    return std::nullopt;
+  // The object's base address within the VGPR file comes from the metadata
+  // attached to the alloca.
+  return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+}
+
+// Lower a load/store of a "VGPR as memory" object into one
+// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
+// offset. The pseudos are later expanded into subregister copies by
+// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
+// split into their dword lanes; sub-dword and non-dword-multiple accesses are
+// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
+// true if \p N was rewritten.
+bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
+ // ---- Load path ----
+ if (auto *Load = dyn_cast<LoadSDNode>(N)) {
+ // Only simple, non-extending loads in the VGPR address space are handled.
+ if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
+ Load->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+ EVT VT = Load->getValueType(0);
+ unsigned Bits = VT.getFixedSizeInBits();
+ // Whole dwords only; anything else is left untouched.
+ if (Bits == 0 || Bits % 32 != 0)
+ return false;
+ // The address must resolve to a dword-aligned constant byte offset.
+ std::optional<unsigned> Offset =
+ getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
+ if (!Offset || (*Offset % 4 != 0))
+ return false;
+
+ SDLoc DL(N);
+ unsigned NumDwords = Bits / 32;
+ SmallVector<SDValue, 4> Dwords;
+ SmallVector<SDValue, 4> Chains;
+ // One SI_VGPR_FRAME_LOAD per dword. Every lane takes the original load's
+ // input chain, so the lanes are not ordered against each other.
+ for (unsigned I = 0; I != NumDwords; ++I) {
+ SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+ Load->getChain()};
+ MachineSDNode *Lane = CurDAG->getMachineNode(
+ AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
+ // Only lane 0 inherits the original memory operand.
+ if (I == 0)
+ CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
+ Dwords.push_back(SDValue(Lane, 0));
+ Chains.push_back(SDValue(Lane, 1));
+ }
+
+ // Reassemble the loaded value in its original type: a single dword is
+ // bitcast directly, several dwords go through a vector of i32.
+ SDValue Val;
+ if (NumDwords == 1) {
+ Val = Dwords[0];
+ if (VT != MVT::i32)
+ Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
+ } else {
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+ SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
+ Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
+ }
+ // Merge the per-lane output chains when there is more than one lane.
+ SDValue Chain = NumDwords == 1 ? Chains[0]
+ : CurDAG->getNode(ISD::TokenFactor, DL,
+ MVT::Other, Chains);
+ // Redirect users of both the loaded value and the chain; the original load
+ // node itself is not deleted here.
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
+ return true;
+ }
+
+ // ---- Store path ----
+ if (auto *Store = dyn_cast<StoreSDNode>(N)) {
+ // Only simple, non-truncating stores in the VGPR address space are handled.
+ if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
+ Store->isTruncatingStore())
+ return false;
+ SDValue Val = Store->getValue();
+ EVT VT = Val.getValueType();
+ unsigned Bits = VT.getFixedSizeInBits();
+ // Whole dwords only; anything else is left untouched.
+ if (Bits == 0 || Bits % 32 != 0)
+ return false;
+ // The address must resolve to a dword-aligned constant byte offset.
+ std::optional<unsigned> Offset =
+ getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
+ if (!Offset || (*Offset % 4 != 0))
+ return false;
+
+ SDLoc DL(N);
+ unsigned NumDwords = Bits / 32;
+ SmallVector<SDValue, 4> Dwords;
+ // Break the stored value into i32 lanes: a single dword is bitcast
+ // directly, wider values are viewed as a vector of i32 and extracted.
+ if (NumDwords == 1) {
+ if (VT != MVT::i32)
+ Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
+ Dwords.push_back(Val);
+ } else {
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+ SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
+ for (unsigned I = 0; I != NumDwords; ++I)
+ Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Vec,
+ CurDAG->getConstant(I, DL, MVT::i32)));
+ }
+
+ SmallVector<SDValue, 4> Chains;
+ // One SI_VGPR_FRAME_STORE per dword, each taking the original store's
+ // input chain.
+ for (unsigned I = 0; I != NumDwords; ++I) {
+ SDValue Ops[] = {Dwords[I],
+ CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+ Store->getChain()};
+ MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
+ DL, MVT::Other, Ops);
+ // Only lane 0 inherits the original memory operand.
+ if (I == 0)
+ CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
+ Chains.push_back(SDValue(Lane, 0));
+ }
+ // Merge the per-lane output chains when there is more than one lane.
+ SDValue Chain = NumDwords == 1 ? Chains[0]
+ : CurDAG->getNode(ISD::TokenFactor, DL,
+ MVT::Other, Chains);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
+ return true;
+ }
- SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+ // Neither a load nor a store: nothing to rewrite.
+ return false;
+}
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
- while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = &*--Position;
- if (N->use_empty())
- continue;
-
- switch (N->getOpcode()) {
- case ISD::BUILD_VECTOR:
- // TODO: Match load d16 from shl (extload:i16), 16
- MadeChange |= matchLoadD16FromBuildVector(N);
- break;
- default:
- break;
+
+ // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
+ // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
+ SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
+ while (VGPRPos != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--VGPRPos;
+ MadeChange |= rewriteVGPRFrameAccess(N);
+ }
+
+ if (Subtarget->d16PreservesUnusedBits()) {
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // TODO: Match load d16 from shl (extload:i16), 16
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 95f85a6151375..cf62874912742 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,6 +67,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool runOnMachineFunction(MachineFunction &MF) override;
bool matchLoadD16FromBuildVector(SDNode *N) const;
+ bool rewriteVGPRFrameAccess(SDNode *N);
void PreprocessISelDAG() override;
void Select(SDNode *N) override;
void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index d052f3c73920c..376a1ebcc4256 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
FUNCTION_PASS("amdgpu-promote-kernel-arguments",
AMDGPUPromoteKernelArgumentsPass())
FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
new file mode 100644
index 0000000000000..a3a1cf6f18bed
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -0,0 +1,145 @@
+//===-- AMDGPUPrivateObjectVGPRs.cpp - Lower VGPR-as-memory accesses ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
+/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
+/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
+/// carries a constant byte offset, which selects the dword (subregister) to
+/// copy.
+///
+/// This runs once the function is out of SSA form (so the single backing tuple
+/// can be defined by several subregister copies) and while LiveIntervals is
+/// available. The backing tuple has lane-divergent liveness (its subregisters
+/// are written and read independently), which the whole-register LiveVariables
+/// analysis cannot represent; the pass therefore updates the subregister-aware
+/// LiveIntervals directly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-private-object-vgprs"
+
+namespace {
+
+/// Legacy machine-function pass that replaces SI_VGPR_FRAME_{LOAD,STORE}
+/// pseudos with COPYs from/to a virtual VGPR tuple (see
+/// runOnMachineFunction).
+class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+public:
+ // Pass identification; its address is exported as
+ // AMDGPUPrivateObjectVGPRsID.
+ static char ID;
+
+ AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+
+ /// \returns true if any pseudo was expanded in \p MF.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Private Object VGPRs";
+ }
+
+ // Requires LiveIntervals and declares it, together with SlotIndexes and the
+ // CFG, as preserved.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addPreserved<LiveIntervalsWrapperPass>();
+ AU.addPreserved<SlotIndexesWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+ "AMDGPU Private Object VGPRs", false, false)
+
+char AMDGPUPrivateObjectVGPRs::ID = 0;
+
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+
+/// Expand every SI_VGPR_FRAME_{LOAD,STORE} pseudo in \p MF into a COPY from/to
+/// a subregister of one virtual VGPR tuple that backs the function's
+/// "VGPR as memory" objects, and keep LiveIntervals up to date.
+///
+/// The tuple is sized from the largest byte offset carried by any pseudo. Not
+/// every dword count has a VGPR tuple register class, so the size is rounded
+/// up to the next width for which one exists. If no class is large enough a
+/// usage error is reported, rather than relying on an assert that vanishes in
+/// release builds and would leave a null register class behind.
+///
+/// \returns true if at least one pseudo was expanded, false otherwise.
+bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Collect the pseudos and determine how many dwords the backing tuple needs.
+  SmallVector<MachineInstr *, 8> Worklist;
+  unsigned NumDwords = 0;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opc = MI.getOpcode();
+      if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
+          Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+        continue;
+      // Operand 1 is the constant byte offset for both pseudos.
+      unsigned ByteOffset = MI.getOperand(1).getImm();
+      NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+      Worklist.push_back(&MI);
+    }
+  }
+
+  if (Worklist.empty())
+    return false;
+
+  LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+
+  // Tuple classes exist only for certain widths, so search upwards from the
+  // required size. The widest VGPR tuple is 1024 bits (32 dwords).
+  constexpr unsigned MaxTupleDwords = 32;
+  const TargetRegisterClass *RC = nullptr;
+  for (unsigned Dwords = NumDwords; Dwords <= MaxTupleDwords && !RC; ++Dwords)
+    RC = TRI->getVGPRClassForBitWidth(Dwords * 32);
+  if (!RC)
+    reportFatalUsageError(
+        "VGPR-as-memory objects do not fit in a single VGPR tuple");
+  Register Storage = MRI.createVirtualRegister(RC);
+
+  // Define the whole tuple up front so partial (subregister) writes and reads
+  // of uninitialized lanes are well formed.
+  MachineBasicBlock &Entry = MF.front();
+  MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
+                                 TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
+  LIS->InsertMachineInstrInMaps(*ImpDef);
+
+  for (MachineInstr *MI : Worklist) {
+    MachineBasicBlock &MBB = *MI->getParent();
+    const DebugLoc &DL = MI->getDebugLoc();
+    unsigned Dword = MI->getOperand(1).getImm() / 4;
+    // A one-dword file uses a plain 32-bit register and needs no subregister
+    // index (the search above returns the 32-bit class first in that case).
+    unsigned SubReg = NumDwords == 1
+                          ? AMDGPU::NoSubRegister
+                          : SIRegisterInfo::getSubRegFromChannel(Dword);
+
+    MachineInstr *Copy;
+    if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+      // Load: copy the selected lane of the tuple into the pseudo's result.
+      Register Dst = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
+                 .addReg(Storage, {}, SubReg);
+    } else {
+      // Store: copy the pseudo's value operand into the selected lane.
+      Register Src = MI->getOperand(0).getReg();
+      Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
+                 .addReg(Storage, RegState::Define, SubReg)
+                 .addReg(Src);
+    }
+    // The copy takes the pseudo's slot, so the intervals of the copied
+    // load/store operand stay valid.
+    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
+    MI->eraseFromParent();
+  }
+
+  // The backing tuple is brand new; compute its (subregister) live interval.
+  LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
+
+  // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
+  // form disconnected value-number components within the single tuple, which an
+  // individual live interval must not contain. Split them into separate
+  // virtual registers, exactly as the register coalescer does for the intervals
+  // it leaves behind.
+  SmallVector<LiveInterval *, 4> SplitLIs;
+  LIS->splitSeparateComponents(LI, SplitLIs);
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 2223b9d036fa1..c587302c3bbae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -138,6 +139,7 @@ class AMDGPUPromoteAllocaImpl {
unsigned MaxVGPRs;
unsigned VGPRBudgetRatio;
unsigned MaxVectorRegs;
+ unsigned AllocVGPROffset = 0;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -162,6 +164,10 @@ class AMDGPUPromoteAllocaImpl {
void analyzePromoteToVector(AllocaAnalysis &AA) const;
void promoteAllocaToVector(AllocaAnalysis &AA);
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+
+ /// Allocate an alloca that already lives in the VGPR address space to a range
+ /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
+ void allocateVgprs(AllocaAnalysis &AA);
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
SetVector<IntrinsicInst *> &DeferredIntrs);
void
@@ -179,7 +185,11 @@ class AMDGPUPromoteAllocaImpl {
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
}
- bool run(Function &F, bool PromoteToLDS);
+ /// IsLatePass is true when invoked as a codegen pass and false when invoked
+ /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
+ /// requests only the work strictly required for functionality (i.e. VGPR
+ /// allocation), skipping the optimization-oriented promotions.
+ bool run(Function &F, bool IsLatePass, bool NoOpt);
};
// FIXME: This can create globals so should be a module pass.
@@ -187,26 +197,34 @@ class AMDGPUPromoteAlloca : public FunctionPass {
public:
static char ID;
- AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+ explicit AMDGPUPromoteAlloca(
+ CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+ : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(), *F.getParent(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
- .run(F, /*PromoteToLDS*/ true);
+ .run(F, /*IsLatePass=*/true, NoOpt);
+ }
return false;
}
- StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+ StringRef getPassName() const override {
+ return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ bool NoOpt;
};
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -251,7 +269,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
- .run(F, /*PromoteToLDS=*/true);
+ .run(F, /*IsLatePass=*/true, /*NoOpt=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -264,7 +282,20 @@ PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
- .run(F, /*PromoteToLDS=*/false);
+ .run(F, /*IsLatePass=*/false, /*NoOpt=*/false);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
+ .run(F, /*IsLatePass=*/true, /*NoOpt=*/true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -273,8 +304,8 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
return PreservedAnalyses::all();
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca() {
- return new AMDGPUPromoteAlloca();
+FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
+ return new AMDGPUPromoteAlloca(OptLevel);
}
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -367,14 +398,121 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
- if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
+// A "VGPR as memory" object can only be realized in registers today when every
+// access is a simple, constant-offset, dword-aligned, whole-dword-multiple (32,
+// 64, ... bit) load/store that lies entirely inside the object, and its address
+// never escapes. Sub-dword accesses, dynamic indexing and escaping addresses
+// need gfx13 support, which is not yet available; such objects fall back to
+// scratch instead.
+//
+// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
+// support so this fallback is no longer needed.
+//
+// \param AI the alloca in the VGPR address space to inspect.
+// \param DL data layout used to size accesses and fold constant GEP offsets.
+// \returns true if every transitive use of \p AI is a lowerable load/store (or
+// a constant-index GEP leading only to such accesses).
+static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
+                                            const DataLayout &DL) {
+  // Accesses are bounds-checked against the object, so its size must be a
+  // known, fixed number of bytes.
+  std::optional<TypeSize> AllocSize = AI.getAllocationSize(DL);
+  if (!AllocSize || AllocSize->isScalable())
+    return false;
+  const uint64_t ObjectBytes = AllocSize->getFixedValue();
+
+  // An access is lowerable if it covers a whole number of dwords, starts at a
+  // dword-aligned constant offset from the alloca, and stays inside the object.
+  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
+    if (!Simple)
+      return false;
+    uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
+        DL, Off, /*AllowNonInbounds=*/true);
+    // The accumulated offset is signed: a negative multiple of 4 would pass an
+    // unsigned remainder test, so reject it before reading it as unsigned.
+    if (Base != &AI || Off.isNegative())
+      return false;
+    const uint64_t Begin = Off.getZExtValue();
+    const uint64_t Bytes = Bits / 8;
+    // Written as a subtraction so Begin + Bytes cannot wrap.
+    return Begin % 4 == 0 && Bytes <= ObjectBytes &&
+           Begin <= ObjectBytes - Bytes;
+  };
+
+  SmallVector<const Use *, 16> Worklist;
+  for (const Use &U : AI.uses())
+    Worklist.push_back(&U);
+
+  while (!Worklist.empty()) {
+    const Use *U = Worklist.pop_back_val();
+    const User *Usr = U->getUser();
+
+    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      // The GEP itself is fine; what matters is how its result is accessed.
+      for (const Use &GU : GEP->uses())
+        Worklist.push_back(&GU);
+      continue;
+    }
+    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
+      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
+        return false;
+      continue;
+    }
+    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
+      // The pointer must be the address operand, not a stored value (escape).
+      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return false;
+      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
+                    SI->isSimple()))
+        return false;
+      continue;
+    }
+    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
+    // otherwise not statically lowerable.
+    return false;
+  }
+  return true;
+}
+
+// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
+// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
+// falls back to ordinary scratch.
+//
+// GEPs are rebuilt on top of \p New and their users rewritten recursively;
+// lifetime markers on the old pointer are dropped; an address-space cast that
+// already targets \p New's type is folded away; every other user just has its
+// operand replaced.
+//
+// NOTE(review): users whose own result type follows the operand's address
+// space (e.g. select/phi of the pointer, or intrinsics overloaded on the
+// pointer type) are not retyped here — confirm they cannot reach this path.
+static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
+  // Snapshot the use list: the loop below rewrites and erases users of Old.
+  SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
+  for (Use *U : Uses) {
+    User *Usr = U->getUser();
+    if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      IRBuilder<> B(GEP);
+      SmallVector<Value *, 4> Indices(GEP->indices());
+      Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
+                                  GEP->getName(), GEP->getNoWrapFlags());
+      rewriteVGPRPointerToScratch(GEP, NewGEP);
+      GEP->eraseFromParent();
+      continue;
+    }
+    if (auto *II = dyn_cast<IntrinsicInst>(Usr);
+        II && II->isLifetimeStartOrEnd()) {
+      II->eraseFromParent();
+      continue;
+    }
+    if (auto *ASC = dyn_cast<AddrSpaceCastInst>(Usr);
+        ASC && ASC->getType() == New->getType()) {
+      // Repointing this cast would make its source and destination address
+      // spaces identical, which is not a valid addrspacecast. The cast is now
+      // a no-op, so forward New to its users instead.
+      ASC->replaceAllUsesWith(New);
+      ASC->eraseFromParent();
+      continue;
+    }
+    // Loads, stores, other address-space casts and call arguments only need
+    // this operand repointed; their result types do not depend on the
+    // operand's address space.
+    U->set(New);
+  }
+}
+
+// Replace the "VGPR as memory" alloca \p AI with an equivalent alloca in the
+// private (scratch) address space: same allocated type, array size, alignment,
+// name and debug location, inserted at \p AI's position. All uses of \p AI are
+// rewritten to the new alloca and \p AI is erased.
+static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
+  Type *ObjectTy = AI->getAllocatedType();
+  Value *Count = AI->getArraySize();
+  AllocaInst *Scratch =
+      new AllocaInst(ObjectTy, AMDGPUAS::PRIVATE_ADDRESS, Count, AI->getAlign(),
+                     AI->getName(), AI->getIterator());
+  Scratch->setDebugLoc(AI->getDebugLoc());
+
+  rewriteVGPRPointerToScratch(AI, Scratch);
+  AI->eraseFromParent();
+}
+
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
+ assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
+ if (!IsLatePass && DisablePromoteAllocaToVector)
return false;
+ bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
+ bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
+
bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM, F) : 128;
setFunctionLimits(F);
+ // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
+ // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const bool TargetSupportsVGPRAsMemory =
+ ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
+
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
@@ -391,8 +529,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
AllocaAnalysis AA{AI};
+ if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
+ // Allocas that already live in the VGPR address space only need to be
+ // assigned VGPRs, which is required for functionality.
+ if (IsLatePass)
+ Allocas.push_back(std::move(AA));
+ continue;
+ }
+ if (!PromoteToVector && !PromoteToLDS)
+ continue;
if (collectAllocaUses(AA)) {
- analyzePromoteToVector(AA);
+ if (PromoteToVector)
+ analyzePromoteToVector(AA);
if (PromoteToLDS)
analyzePromoteToLDS(AA);
if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -403,8 +551,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
- stable_sort(Allocas,
- [](const auto &A, const auto &B) { return A.Score > B.Score; });
+ stable_sort(Allocas, [](const auto &A, const auto &B) {
+ // Prioritize pre-existing VGPR allocas, since their allocation must not
+ // fail.
+ bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+ bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+ if (AIsVGPR != BIsVGPR)
+ return AIsVGPR;
+ return A.Score > B.Score;
+ });
// clang-format off
LLVM_DEBUG(
@@ -417,6 +572,39 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool Changed = false;
SetVector<IntrinsicInst *> DeferredIntrs;
for (AllocaAnalysis &AA : Allocas) {
+ if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
+ // Fall back to scratch (and warn) when the object can't be kept in
+ // registers, so the program still compiles correctly: either the target
+ // does not support "VGPR as memory", or the access pattern (dynamic
+ // index, sub-dword, escaping address) is not yet supported.
+ const char *Unsupported = nullptr;
+ if (!TargetSupportsVGPRAsMemory)
+ Unsupported = "not supported on this target";
+ else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
+ Unsupported = "dynamic indexing, sub-dword access, or escaping address "
+ "is not yet supported";
+ if (Unsupported) {
+ F.getContext().diagnose(DiagnosticInfoUnsupported(
+ F,
+ Twine("'amdgpu_vgpr' object could not be kept in vector registers "
+ "(") +
+ Unsupported + "); using scratch memory instead",
+ AA.Alloca->getDebugLoc(), DS_Warning));
+ demoteVGPRAllocaToScratch(AA.Alloca);
+ Changed = true;
+ continue;
+ }
+ const unsigned AllocaCost =
+ AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
+ allocateVgprs(AA);
+ // Account for the consumed VGPRs in the vectorization budget.
+ if (VectorizationBudget > AllocaCost)
+ VectorizationBudget -= AllocaCost;
+ else
+ VectorizationBudget = 0;
+ Changed = true;
+ continue;
+ }
if (AA.Vector.Ty) {
std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
assert(Size); // Expected to succeed on non-array alloca.
@@ -451,6 +639,21 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
return Changed;
}
+/// Assign the VGPR-address-space alloca in \p AA the next free byte range of
+/// the per-function VGPR area and record it on the alloca as
+/// !amdgpu.allocated.vgprs metadata: {byte offset, size in bytes}.
+///
+/// The running offset (AllocVGPROffset) advances by the size rounded up to a
+/// whole dword, so every object starts on a dword boundary.
+void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
+  LLVMContext &Ctx = Mod->getContext();
+  // Use the allocation size (the same quantity the caller charges against the
+  // vectorization budget) rather than the type's bit size divided by 8: the
+  // latter truncates partial bytes and ignores allocation padding, so it can
+  // reserve fewer bytes than an in-bounds access may touch (e.g. an i33 object
+  // occupies 8 bytes but has a bit size of 33).
+  std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
+  assert(Size && "VGPR alloca must have a known allocation size");
+  const unsigned AllocaSize = Size->getFixedValue();
+
+  // Record where the object was allocated within the VGPR file.
+  Type *I32 = Type::getInt32Ty(Ctx);
+  AA.Alloca->setMetadata(
+      "amdgpu.allocated.vgprs",
+      MDNode::get(
+          Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
+                ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
+  AllocVGPROffset += alignTo(AllocaSize, 4);
+}
+
+
// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 06bfc7e1a5162..7fc233be91fe0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -668,6 +668,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
+ initializeAMDGPUPrivateObjectVGPRsPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1500,9 +1501,12 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAtomicExpandLegacyPass());
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- addPass(createAMDGPUPromoteAlloca());
+ // With optimizations enabled, do the full promotion of allocas. Without
+ // optimizations, this only allocates pre-existing VGPR address space allocas,
+ // which is required for functionality.
+ addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
@@ -1717,6 +1721,11 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+ // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+ // At O0 there is no register coalescer; anchor on TwoAddress, where
+ // LiveIntervals is already available.
+ insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
+
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
TargetPassConfig::addFastRegAlloc();
@@ -1743,6 +1752,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+ // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+ // This runs after the coalescer so it does not perturb the kill flags that
+ // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
+ // the LiveIntervals the register allocator consumes next.
+ insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
+
if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
@@ -2283,8 +2298,15 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
addFunctionPass(AtomicExpandPass(TM), PMW);
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ // With optimizations enabled, do the full promotion of allocas. Without
+ // optimizations, only allocate pre-existing VGPR address space allocas, which
+ // is required for functionality.
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
+ else
+ addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
+
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses(PMW);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 46edc44e2cc05..dd25ab71997d7 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -100,6 +100,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUPreloadKernArgProlog.cpp
AMDGPUPreloadKernelArguments.cpp
AMDGPUPrintfRuntimeBinding.cpp
+ AMDGPUPrivateObjectVGPRs.cpp
AMDGPUPromoteAlloca.cpp
AMDGPUPromoteKernelArguments.cpp
AMDGPURegBankCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 750cb1973e21f..3594caef86782 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,6 +1243,25 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
}
} // End Spill = 1, VALU = 1, isConvergent = 1
+// Pseudo accesses for "VGPR as memory" objects, i.e. allocas in the VGPR
+// address space (AMDGPUAS::VGPR). Each one loads or stores a single dword at a
+// constant byte offset within the per-function VGPR file. Instruction
+// selection emits them and the AMDGPUPrivateObjectVGPRs pass turns them into
+// register copies before register allocation.
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins i32imm:$offset)>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
+  (ins VGPR_32:$vdata, i32imm:$offset)>;
+
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96571dd028b14..7528cd2a009a3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
@@ -1779,6 +1780,17 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
return false;
}
+// Decode the two-operand !amdgpu.allocated.vgprs node attached to \p Alloca.
+// Operand 0 becomes Address and operand 1 becomes Size. The node is required
+// to be present with exactly two operands (asserted).
+AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
+  const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
+  assert(MD && MD->getNumOperands() == 2 &&
+         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
+
+  // Both fields are stored as integer constants; read them the same way.
+  auto ReadField = [MD](unsigned Idx) -> unsigned {
+    return mdconst::extract<ConstantInt>(MD->getOperand(Idx))->getZExtValue();
+  };
+
+  AllocatedVGPRsMetadata Result;
+  Result.Address = ReadField(0);
+  Result.Size = ReadField(1);
+  return Result;
+}
+
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 2c61abf946f99..923c5c3a988fd 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -31,6 +31,7 @@ struct amd_kernel_code_t;
namespace llvm {
struct Align;
+class AllocaInst;
class Argument;
class Function;
class GlobalValue;
@@ -1037,6 +1038,16 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
+/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
+/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
+/// to within the VGPR file, and its size in bytes.
+struct AllocatedVGPRsMetadata {
+ unsigned Address;
+ unsigned Size;
+
+ static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
+};
+
// The following methods are only meaningful on targets that support
// S_WAITCNT.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
new file mode 100644
index 0000000000000..f6c64c5121867
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
+
+define void @vgpr_alloca() {
+; CHECK-LABEL: define void @vgpr_alloca(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ ret void
+}
+
+define void @vgpr_alloca_multiple() {
+; CHECK-LABEL: define void @vgpr_alloca_multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
+; CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[B]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca i32, align 4, addrspace(13)
+ %b = alloca [2 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ store i32 0, ptr addrspace(13) %b
+ ret void
+}
+
+define void @private_alloca_unchanged() {
+; CHECK-LABEL: define void @private_alloca_unchanged(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: store i64 42, ptr addrspace(5) [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i64], align 4, addrspace(5)
+ store i64 42, ptr addrspace(5) %a
+ ret void
+}
+
+declare void @use(ptr)
+
+; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
+; back to ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
+; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
+; CHECK-NEXT: store i32 [[V]], ptr addrspace(5) [[P2]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+; A VGPR object whose address escapes (here via a cast to a generic pointer, as
+; the frontend emits) cannot be kept in registers yet, so it falls back to
+; ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_escaping() {
+; CHECK-LABEL: define void @vgpr_alloca_escaping(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
+; CHECK-NEXT: call void @use(ptr [[CAST]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %cast = addrspacecast ptr addrspace(13) %a to ptr
+ call void @use(ptr %cast)
+ ret void
+}
+
+; Whole-dword-multiple accesses (here i64) stay in VGPRs.
+define void @vgpr_alloca_i64(i64 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_i64(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
+; CHECK-NEXT: store i64 [[V]], ptr addrspace(13) [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a = alloca i64, align 8, addrspace(13)
+ store i64 %v, ptr addrspace(13) %a
+ ret void
+}
+
+; Sub-dword accesses are not supported yet, so the object falls back to scratch.
+define void @vgpr_alloca_subdword(i16 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_subdword(
+; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
+; CHECK-NEXT: store i16 [[V]], ptr addrspace(5) [[A1]], align 2
+; CHECK-NEXT: ret void
+;
+ %a = alloca [2 x i16], align 4, addrspace(13)
+ store i16 %v, ptr addrspace(13) %a
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0, i32 16}
+; CHECK: [[META1]] = !{i32 0, i32 4}
+; CHECK: [[META2]] = !{i32 4, i32 8}
+; CHECK: [[META3]] = !{i32 0, i32 8}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
new file mode 100644
index 0000000000000..63ba44b479279
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -0,0 +1,20 @@
+; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
+; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
+; scratch.
+
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+
+define void @vgpr_obj() {
+; SUPP: alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
+; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
new file mode 100644
index 0000000000000..ea914907a900d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
@@ -0,0 +1,58 @@
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
+; indices must lower to register copies, never to scratch/buffer memory traffic.
+
+; CHECK-LABEL: store_load_i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_i32(i32 %v) {
+ %a = alloca i32, align 4, addrspace(13)
+ store i32 %v, ptr addrspace(13) %a
+ %l = load i32, ptr addrspace(13) %a
+ %r = add i32 %l, 1
+ ret i32 %r
+}
+
+; CHECK-LABEL: store_load_array:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_array(i32 %v) {
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
+ %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
+ store i32 %v, ptr addrspace(13) %p1
+ store i32 7, ptr addrspace(13) %p3
+ %l1 = load i32, ptr addrspace(13) %p1
+ %l3 = load i32, ptr addrspace(13) %p3
+ %s = add i32 %l1, %l3
+ ret i32 %s
+}
+
+; A 64-bit (two-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_i64:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i64 @store_load_i64(i64 %v) {
+ %a = alloca i64, align 8, addrspace(13)
+ store i64 %v, ptr addrspace(13) %a
+ %l = load i64, ptr addrspace(13) %a
+ %r = add i64 %l, 1
+ ret i64 %r
+}
+
+; A vector (four-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_v4i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
+ %a = alloca <4 x i32>, align 16, addrspace(13)
+ store <4 x i32> %v, ptr addrspace(13) %a
+ %l = load <4 x i32>, ptr addrspace(13) %a
+ ret <4 x i32> %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 0c591ec5b4669..0dbabd2991bc4 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -29,6 +29,7 @@
; GCN-O0-NEXT: amdgpu-lower-module-lds
; GCN-O0-NEXT: function
; GCN-O0-NEXT: atomic-expand
+; GCN-O0-NEXT: amdgpu-vgpr-allocate
; GCN-O0-NEXT: verify
; GCN-O0-NEXT: unreachableblockelim
; GCN-O0-NEXT: ee-instrument<post-inline>
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 070c873798647..aabfadd33e976 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,11 +49,13 @@
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand Atomic instructions
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Natural Loop Information
+; GCN-O0-NEXT: AMDGPU VGPR Allocate
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
-; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: AMDGPU lower intrinsics
@@ -115,6 +117,7 @@
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
+; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
@@ -359,6 +362,7 @@
; GCN-O1-NEXT: Live Interval Analysis
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Register Coalescer
+; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: Rename Disconnected Subregister Components
; GCN-O1-NEXT: Rewrite Partial Register Uses
; GCN-O1-NEXT: Machine Instruction Scheduler
@@ -676,6 +680,7 @@
; GCN-O1-OPTS-NEXT: Live Interval Analysis
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Register Coalescer
+; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: Rewrite Partial Register Uses
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
@@ -998,6 +1003,7 @@
; GCN-O2-NEXT: Live Interval Analysis
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Register Coalescer
+; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: Rewrite Partial Register Uses
; GCN-O2-NEXT: Machine Instruction Scheduler
@@ -1334,6 +1340,7 @@
; GCN-O3-NEXT: Live Interval Analysis
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Register Coalescer
+; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: Rewrite Partial Register Uses
; GCN-O3-NEXT: Machine Instruction Scheduler
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index f31d6228d7936..bd760de79c9d0 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,23 +2,23 @@
target triple = "amdgcn-amd-amdhsa"
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
define void @static_alloca() {
entry:
@@ -32,26 +32,27 @@ entry:
%alloca.7 = alloca i32, align 4, addrspace(7)
%alloca.8 = alloca i32, align 4, addrspace(8)
%alloca.9 = alloca i32, align 4, addrspace(9)
+ %alloca.13 = alloca i32, align 4, addrspace(13)
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
define void @dynamic_alloca_i32(i32 %n) {
entry:
@@ -68,23 +69,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
define void @dynamic_alloca_i64(i64 %n) {
entry:
>From 6d47f41fc10ba761312d9e98e15e51ed78c7e997 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Wed, 24 Jun 2026 19:32:40 -0500
Subject: [PATCH 2/2] Lower VGPR-as-memory accesses via REG_LOAD/REG_STORE
nodes
---
clang/include/clang/Basic/AttrDocs.td | 26 +-
.../clang/Basic/DiagnosticCommonKinds.td | 5 -
.../clang/Basic/DiagnosticSemaKinds.td | 12 +-
clang/include/clang/Sema/SemaAMDGPU.h | 4 +
clang/lib/CodeGen/CGDecl.cpp | 43 +-
clang/lib/Sema/SemaAMDGPU.cpp | 38 +-
clang/lib/Sema/SemaDecl.cpp | 2 +
clang/test/CodeGen/target-data.c | 4 +-
.../CodeGenHIP/amdgpu-vgpr-O0-warning.hip | 14 -
clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip | 19 +
clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip | 25 +-
clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl | 2 +-
clang/test/SemaCUDA/amdgpu-vgpr.cu | 26 +-
llvm/docs/AMDGPUUsage.rst | 37 +-
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 13 +-
llvm/lib/IR/AutoUpgrade.cpp | 5 +
llvm/lib/IR/Verifier.cpp | 7 +
llvm/lib/IR/VerifierAMDGPU.cpp | 51 ++-
llvm/lib/IR/VerifierInternal.h | 4 +
llvm/lib/Target/AMDGPU/AMDGPU.h | 17 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +------
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 -
.../Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp | 315 +++++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 3 +-
.../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp | 269 +++++++----
.../Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h | 23 +
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 231 +---------
.../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 14 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 48 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 433 +++++++++++++++++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 3 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 11 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 69 ++-
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 6 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 46 ++
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 8 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 -
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 +-
llvm/lib/TargetParser/TargetDataLayout.cpp | 4 +-
.../AMDGPU/amdgpu-vgpr-allocate-basic.ll | 109 -----
.../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll | 20 -
.../CodeGen/AMDGPU/as-vgpr-alloca-static.ll | 58 ---
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 7 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 19 +-
llvm/test/CodeGen/AMDGPU/nullptr.ll | 2 +-
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 1 +
.../AMDGPU/vgpr-as-memory-addrspacecast.ll | 49 ++
.../AMDGPU/vgpr-as-memory-callgraph.ll | 50 ++
.../AMDGPU/vgpr-as-memory-constexpr.ll | 44 ++
.../CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll | 346 ++++++++++++++
.../AMDGPU/vgpr-as-memory-error-const-oob.ll | 15 +
.../vgpr-as-memory-error-dynamic-toolarge.ll | 15 +
.../vgpr-as-memory-error-extern-call.ll | 19 +
.../vgpr-as-memory-error-indirect-call.ll | 15 +
.../vgpr-as-memory-error-inlineasm-clobber.ll | 15 +
.../vgpr-as-memory-error-ungrouped-call.ll | 19 +
.../vgpr-as-memory-error-unsupported-more.ll | 32 ++
.../vgpr-as-memory-error-unsupported.ll | 15 +
.../AMDGPU/vgpr-as-memory-function-ref.ll | 18 +
.../AMDGPU/vgpr-as-memory-gisel-fallback.ll | 28 ++
.../AMDGPU/vgpr-as-memory-lower-module.ll | 80 ++++
.../CodeGen/AMDGPU/vgpr-as-memory-subdword.ll | 63 +++
llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll | 75 +++
llvm/test/Verifier/AMDGPU/alloca.ll | 56 +--
llvm/test/Verifier/AMDGPU/vgpr-memory.ll | 49 ++
.../Bitcode/DataLayoutUpgradeTest.cpp | 38 +-
68 files changed, 2446 insertions(+), 859 deletions(-)
delete mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
create mode 100644 llvm/test/Verifier/AMDGPU/vgpr-memory.ll
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 7dcf35fe3bd83..f8720cd67efad 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3607,20 +3607,22 @@ An error will be given if:
def AMDGPUVGPRDocs : Documentation {
let Category = DocCatAMDGPUAttributes;
let Content = [{
-This attribute requests that a kernel-local variable be allocated in the
-"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
-so that accesses with statically known indices lower to vector register
-copies instead of scratch memory traffic.
+This attribute requests that a device-side local variable be placed in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target, so that
+its accesses lower to vector register copies (constant index) or hardware
+register-indexing sequences (dynamic index) instead of scratch memory traffic.
+
+Such a variable is backed by a fixed block of vector registers rather than the
+stack, so - like an LDS/``__shared__`` variable - it is emitted as an internal
+global in ``addrspace(13)`` with a ``poison`` initializer; its contents are
+undefined until written. This is honored at every optimization level, including
+``-O0``.
Clang supports the ``__attribute__((amdgpu_vgpr))`` or
-``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
-local variables declared in a ``__global__`` (kernel) function; applying it to
-a variable in a ``__device__`` or host function, or outside HIP/CUDA, is an
-error.
-
-Known limitation: the request is only honored with optimizations enabled. At
-``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
-emitted.
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. Like ``__shared__``, it may be
+applied to a local variable in any device-side function (a ``__global__``
+kernel or a ``__device__`` function); applying it to a variable in host code is
+an error.
}];
}
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index fe03be43c80c7..f2ed2f4698b8d 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,11 +319,6 @@ def warn_stack_protection_ignore_attribute : Warning<
"'stack_protector_ignore' attribute ignored due to "
"'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
-def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
- "%0 is not guaranteed to keep the variable in vector registers at -O0; "
- "it may fall back to scratch memory">,
- InGroup<DiagGroup<"amdgpu-vgpr">>;
-
def warn_slh_does_not_support_asm_goto : Warning<
"speculative load hardening does not protect functions with asm goto">,
InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index a5e56e94509da..74b933ec743bb 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3711,9 +3711,15 @@ def err_attribute_argument_invalid : Error<
def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
"'amdgpu_flat_work_group_size' attribute must match "
"'reqd_work_group_size' product">;
-def err_amdgpu_vgpr_not_kernel_local : Error<
- "%0 attribute can only be applied to local variables in "
- "'__global__' (kernel) functions">;
+def err_amdgpu_vgpr_host : Error<
+ "'amdgpu_vgpr' variables are not allowed in "
+ "%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
+def err_amdgpu_vgpr_initializer : Error<
+ "a variable with the 'amdgpu_vgpr' attribute cannot have an initializer; it "
+ "is backed by registers with undefined initial contents">;
+def err_amdgpu_vgpr_bad_storage : Error<
+ "the 'amdgpu_vgpr' attribute requires an automatic, fixed-size local "
+ "variable">;
def err_attribute_argument_is_zero : Error<
"%0 attribute must be greater than 0">;
def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index 9cb74ed74f4b9..1d604e547ff70 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -81,6 +81,10 @@ class SemaAMDGPU : public SemaBase {
void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
+ /// Diagnose constraints on an 'amdgpu_vgpr' variable that depend on its
+ /// initializer, once the declaration is complete.
+ void checkAMDGPUVGPRVarDecl(VarDecl *VD);
+
/// Expand a valid use of the feature identification builtins into its
/// corresponding sequence of instructions.
Expr *ExpandAMDGPUPredicateBuiltIn(Expr *CE);
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index bca2d11d47c6a..570845f64e126 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1603,30 +1603,23 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// building the instruction so that it's there even in no-asserts
// builds.
//
- // "VGPR as memory" objects keep their backing registers only once the
- // optimizing register allocator runs. At -O0 the backend cannot lower
- // these accesses (e.g. when the address escapes a basic block), so the
- // request is not honored: fall back to an ordinary (scratch) alloca and
- // warn, matching the documented behavior.
- // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
- // backing tuple to scratch) so this fallback can be removed.
- const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+ // A "VGPR as memory" object (amdgpu_vgpr) is register-backed, not on the
+ // stack, so - like LDS/__shared__ - it is emitted as an internal global
+ // in AMDGPUAS::VGPR with a poison initializer (the registers have no
+ // defined initial value). Only in device compilation; on the host (e.g. a
+ // __host__ __device__ function compiled for the host) it falls back to an
+ // ordinary stack alloca.
const bool UseVGPRMemory =
- VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
- if (VGPRAttr && !UseVGPRMemory)
- CGM.getDiags().Report(D.getLocation(),
- diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
- << VGPRAttr;
-
+ D.hasAttr<AMDGPUVGPRAttr>() && getLangOpts().CUDAIsDevice;
if (UseVGPRMemory) {
- // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
- // address space so that statically indexed accesses lower to vector
- // register copies instead of scratch memory.
- auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
- /*ArraySize=*/nullptr, D.getName(),
- AllocaInsertPt->getIterator());
- AI->setAlignment(allocaAlignment.getAsAlign());
- AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), allocaTy, /*isConstant=*/false,
+ llvm::GlobalValue::InternalLinkage,
+ llvm::PoisonValue::get(allocaTy), getStaticDeclName(CGM, D),
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ llvm::AMDGPUAS::VGPR);
+ GV->setAlignment(allocaAlignment.getAsAlign());
+ AllocaAddr = RawAddress(GV, allocaTy, allocaAlignment, KnownNonNull);
address = AllocaAddr;
} else {
address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
@@ -1641,9 +1634,9 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
// Emit a lifetime intrinsic if meaningful. There's no point in doing this
- // if we don't have a valid insertion point (?). "VGPR as memory" allocas
- // live in a non-alloca address space, so the standard lifetime markers
- // (which assume the alloca address space) are skipped for them.
+ // if we don't have a valid insertion point (?). "VGPR as memory" objects
+ // are globals, not allocas, so the stack-slot lifetime markers are
+ // skipped.
if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
// If there's a jump into the lifetime of this variable, its lifetime
// gets broken up into several regions in IR, which requires more work
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 0568ab0b60a07..055bc373a623b 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -22,6 +22,7 @@
#include "clang/Sema/Ownership.h"
#include "clang/Sema/Scope.h"
#include "clang/Sema/Sema.h"
+#include "clang/Sema/SemaCUDA.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
@@ -628,18 +629,41 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
}
void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
- // The LocalVar subject list already guarantees this is a local variable.
- // Restrict it further to locals declared directly in a __global__ kernel;
- // it is meaningless (and an error) in __device__ or host functions.
- const auto *FD = dyn_cast<FunctionDecl>(D->getDeclContext());
- if (!FD || !FD->hasAttr<CUDAGlobalAttr>()) {
- Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+ // Like __shared__/LDS, this is device-side register storage, so it is allowed
+ // in any device-side function (kernel or __device__) and rejected only in
+ // host code. There is no kernel-only restriction: the backend handles direct
+ // references to the resulting addrspace(13) global from any function (e.g.
+ // ones IPO may introduce), independently of where the attribute was written.
+ if (SemaRef.getLangOpts().CUDA &&
+ SemaRef.CUDA().DiagIfHostCode(AL.getLoc(), diag::err_amdgpu_vgpr_host)
+ << SemaRef.CUDA().CurrentTarget())
return;
- }
D->addAttr(::new (getASTContext()) AMDGPUVGPRAttr(getASTContext(), AL));
}
+void SemaAMDGPU::checkAMDGPUVGPRVarDecl(VarDecl *VD) {
+ if (!VD->hasAttr<AMDGPUVGPRAttr>() || VD->isInvalidDecl())
+ return;
+
+ // Only a fixed-size local is register-backed at codegen (the attribute's
+ // LocalVar subject already excludes static-storage locals); a variable-length
+ // array would silently ignore the attribute, so reject it.
+ if (VD->getType()->isVariablyModifiedType()) {
+ Diag(VD->getLocation(), diag::err_amdgpu_vgpr_bad_storage);
+ VD->setInvalidDecl();
+ return;
+ }
+
+ // "VGPR as memory" objects are backed by registers with no defined initial
+ // contents (like __shared__), so they cannot be initialized.
+ if (VD->hasInit()) {
+ Diag(VD->getLocation(), diag::err_amdgpu_vgpr_initializer)
+ << VD->getInit()->getSourceRange();
+ VD->setInvalidDecl();
+ }
+}
+
static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
Expr *MaxExpr,
const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index d45c3eb35094f..40cf988add198 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -15342,6 +15342,8 @@ void Sema::FinalizeDeclaration(Decl *ThisDecl) {
if (getLangOpts().CUDA)
CUDA().checkAllowedInitializer(VD);
+ AMDGPU().checkAMDGPUVGPRVarDecl(VD);
+
// Grab the dllimport or dllexport attribute off of the VarDecl.
const InheritableAttr *DLLAttr = getDLLAttr(VD);
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index a5e0b814c7042..f03aaba8b53dd 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
deleted file mode 100644
index 4d23008b8ef43..0000000000000
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
-// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
-//
-// At -O0 "VGPR as memory" is not honored: the variable falls back to an
-// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
-
-#define __global__ __attribute__((global))
-
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
-__global__ void kernel(int *out, int i) {
- int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
- buf[2] = i;
- out[0] = buf[2];
-}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
new file mode 100644
index 0000000000000..b8618433055cb
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// "VGPR as memory" is honored at every optimization level (it is a global, not
+// an alloca that depends on the optimizing register allocator), so at -O0 the
+// variable is still emitted in addrspace(13) with no diagnostic.
+
+// expected-no-diagnostics
+
+#define __global__ __attribute__((global))
+
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr));
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
index 9a5c38e48951c..712a2121d19a8 100644
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -3,17 +3,30 @@
// RUN: | FileCheck %s
#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
-// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
-// memory" address space (addrspace(13)), and its accesses stay in that space.
+// A variable marked amdgpu_vgpr is emitted as an internal global in the "VGPR
+// as memory" address space (addrspace(13)) with a poison initializer (like an
+// LDS/__shared__ variable), and its accesses stay in that space. It is allowed
+// in a __device__ function too, not just a __global__ kernel.
+
+// CHECK-DAG: @{{.*}}kernel{{.*}} = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-DAG: @{{.*}}device{{.*}} = internal addrspace(13) global [4 x i32] poison, align 4
+
+// CHECK-LABEL: define {{.*}}@_Z6devicePii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}device
+__device__ void device(int *out, int i) {
+ int dbuf[4] __attribute__((amdgpu_vgpr));
+ dbuf[1] = i;
+ out[0] = dbuf[1];
+}
// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
-// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
-// CHECK: store i32 %{{.*}}, ptr addrspace(13)
-// CHECK: load i32, ptr addrspace(13)
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}kernel
+// CHECK: load i32, ptr addrspace(13) {{.*}}@{{.*}}kernel
__global__ void kernel(int *out, int i) {
int buf[4] __attribute__((amdgpu_vgpr));
buf[2] = i;
out[0] = buf[2];
+ device(out, i);
}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..f120db1aaf6cd 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
void foo(void) {}
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
index 6ad3074921b9b..05f2d64d56344 100644
--- a/clang/test/SemaCUDA/amdgpu-vgpr.cu
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -8,13 +8,24 @@ __global__ void kernel() {
(void)ok;
}
-__device__ void device_fn() {
- int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+__global__ void initialized() {
+ // Register-backed storage has undefined initial contents, so (like
+ // __shared__) it cannot be initialized.
+ int bad __attribute__((amdgpu_vgpr)) = 7; // expected-error {{a variable with the 'amdgpu_vgpr' attribute cannot have an initializer}}
+ int arr[2] __attribute__((amdgpu_vgpr)) = {1, 2}; // expected-error {{a variable with the 'amdgpu_vgpr' attribute cannot have an initializer}}
(void)bad;
+ (void)arr;
+}
+
+__device__ void device_fn() {
+ // Allowed in device functions too (like __shared__); the backend handles
+ // references to the global from non-kernel functions.
+ int ok __attribute__((amdgpu_vgpr)); // OK
+ (void)ok;
}
__host__ void host_fn() {
- int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+ int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' variables are not allowed in __host__ functions}}
(void)bad;
}
@@ -26,3 +37,12 @@ __global__ void takes_no_args() {
int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
(void)bad;
}
+
+__global__ void bad_storage(int n) {
+ // A static-storage local is not a LocalVar subject; a VLA is rejected as not
+ // fixed-size. Both must avoid silently ignoring the attribute.
+ static int s __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+ int vla[n] __attribute__((amdgpu_vgpr)); // expected-error {{the 'amdgpu_vgpr' attribute requires an automatic, fixed-size local variable}}
+ (void)s;
+ (void)vla;
+}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 2b522395ee892..b5b56fe2a1310 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -994,7 +994,7 @@ supported for the ``amdgcn`` target.
*reserved for future use* 10
*reserved for future use* 11
*reserved for downstream use (LLPC)* 12
- *reserved for future use* 13
+ VGPR as memory 13 N/A VGPR 32 0xFFFFFFFF
*reserved for future use* 14
*reserved for future use* 16
Streamout Registers 128 N/A GS_REGS
@@ -1104,6 +1104,41 @@ supported for the ``amdgcn`` target.
When using code object V5 ``LIBOMPTARGET_STACK_SIZE`` may be used to provide the
private segment size in bytes, for cases where a dynamic stack is used.
+**VGPR as memory**
+ The "VGPR as memory" address space holds small objects directly in vector
+ registers instead of scratch (private) memory, avoiding memory traffic for
+ frequently accessed kernel-local data. Objects in this address space are
+ represented as global variables (similar to how *Local* memory uses LDS
+ global variables) and are backed by a block of physical VGPRs that is
+ reserved out of the register allocator for the duration of the function.
+
+ An address in this space is a register-relative byte offset into the reserved
+ VGPR block, not an address into an addressable memory segment. A load or
+ store at a constant index lowers to a register copy to/from a fixed VGPR; a
+ load or store at a variable (dynamic) index lowers to a hardware register
+ indexing sequence. Sub-dword (8/16-bit) accesses are implemented as
+ read-modify-write of the containing dword.
+
+ An ``addrspacecast`` to or from this address space has no meaningful
+ translation (there is no real address to convert): it is permitted but lowers
+ to ``poison``. ``ptrtoint``/``inttoptr`` are also permitted, but they are
+ *not* poison - the integer is the register-relative byte offset, so an access
+ through an ``inttoptr`` value is lowered as a dynamic (runtime-indexed)
+ access and clamped into the reserved block like any other dynamic index. The
+ verifier still rejects an initializer on such a global variable, atomic
+ accesses, and memory intrinsics
+ (``llvm.memcpy``/``memset``/``memmove``), none of which can be modelled by
+ register storage. This address space's number (13) coincides with the
+ graphics-only ``CONSTANT_BUFFER_5`` alias, which never co-exists with this
+ feature.
+
+ The backing registers have no defined initial contents: reading an object
+ before it is written is undefined behavior and may observe values left in
+ those physical VGPRs by a previously executed wave, just like reading
+ uninitialized scratch or LDS. The reserved block is per-lane and private to
+ the function's call graph; an out-of-range dynamic index is clamped into the
+ block so it cannot read or modify unrelated registers.
+
**Constant 32-bit**
*TODO*
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index e9d3add54d054..5c536883c6636 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -96,17 +96,25 @@ namespace AMDGPU {
enum class FlatAddrSpace : unsigned { FLAT, FlatGlobal, FlatScratch };
inline bool isFlatGlobalAddrSpace(unsigned AS) {
+ // AMDGPUAS::VGPR is register-backed, not flat-addressable (see its enum
+ // note).
return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+ AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
}
inline bool isExtendedGlobalAddrSpace(unsigned AS) {
+ // AMDGPUAS::VGPR is register-backed, not global (see its enum note).
return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+ (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
}
inline bool isConstantAddressSpace(unsigned AS) {
+ // AMDGPUAS::VGPR is register-backed read/write storage, not constant memory,
+ // despite aliasing CONSTANT_BUFFER_5 (see its enum note).
+ if (AS == AMDGPUAS::VGPR)
+ return false;
switch (AS) {
using namespace AMDGPUAS;
case CONSTANT_ADDRESS:
@@ -185,6 +193,7 @@ constexpr int64_t getNullPointerValue(unsigned AS) {
case PRIVATE_ADDRESS:
case LOCAL_ADDRESS:
case REGION_ADDRESS:
+ case VGPR:
return -1;
default:
return 0;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3a823f906b012..c753e9e2bf56a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6851,6 +6851,11 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
if (!DL.contains("-p9") && !DL.starts_with("p9"))
Res.append("-p9:192:256:256:32");
+
+ // Add sizing for address space 13 ("VGPR as memory"), 32-bit
+ // register-relative indices.
+ if (!DL.contains("-p13") && !DL.starts_with("p13"))
+ Res.append("-p13:32:32");
}
// Upgrade the ELF mangling mode.
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index a4e0f531ab1ef..64046f8e53a20 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -807,6 +807,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
"Global variable is too large to fit into the address space", &GV,
GVType);
+ // Target-specific global variable checks.
+ verifyAMDGPUGlobalVariable(*this, GV);
+
if (!GV.hasInitializer()) {
visitGlobalValue(GV);
return;
@@ -4568,6 +4571,7 @@ void Verifier::visitLoadInst(LoadInst &LI) {
ElTy, &LI);
checkAtomicMemAccessSize(ElTy, &LI);
+ verifyAMDGPUAtomicAccess(*this, LI.getPointerAddressSpace(), &LI);
} else {
Check(LI.getSyncScopeID() == SyncScope::System,
"Non-atomic load cannot have SynchronizationScope specified", &LI);
@@ -4596,6 +4600,7 @@ void Verifier::visitStoreInst(StoreInst &SI) {
"point, or vector type!",
ElTy, &SI);
checkAtomicMemAccessSize(ElTy, &SI);
+ verifyAMDGPUAtomicAccess(*this, SI.getPointerAddressSpace(), &SI);
} else {
Check(SI.getSyncScopeID() == SyncScope::System,
"Non-atomic store cannot have SynchronizationScope specified", &SI);
@@ -4674,6 +4679,7 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
Check(ElTy->isIntOrPtrTy(),
"cmpxchg operand must have integer or pointer type", ElTy, &CXI);
checkAtomicMemAccessSize(ElTy, &CXI);
+ verifyAMDGPUAtomicAccess(*this, CXI.getPointerAddressSpace(), &CXI);
visitInstruction(CXI);
}
@@ -4713,6 +4719,7 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
checkAtomicMemAccessSize(ElTy, &RMWI);
Check(AtomicRMWInst::FIRST_BINOP <= Op && Op <= AtomicRMWInst::LAST_BINOP,
"Invalid binary operation!", &RMWI);
+ verifyAMDGPUAtomicAccess(*this, RMWI.getPointerAddressSpace(), &RMWI);
visitInstruction(RMWI);
}
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index de9a0c7bef132..bbf52af565b20 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
@@ -122,10 +123,37 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
if (!VS.TT.isAMDGPU())
return;
- if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
- AI.getAddressSpace() != AMDGPUAS::VGPR)
- VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
- &AI);
+ if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+}
+
+void llvm::verifyAMDGPUGlobalVariable(VerifierSupport &VS,
+ const GlobalVariable &GV) {
+ if (!VS.TT.isAMDGPU())
+ return;
+
+ if (GV.getAddressSpace() != AMDGPUAS::VGPR)
+ return;
+
+ // "VGPR as memory" objects are backed by registers, which have no defined
+ // initial contents, so (like LDS) they cannot be statically initialized: the
+ // only permitted initializer is an undef/poison placeholder (isa<UndefValue>
+ // also matches poison).
+ Check(!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer()),
+ "global variable in the VGPR address space (13) cannot have an "
+ "initializer",
+ &GV);
+}
+
+void llvm::verifyAMDGPUAtomicAccess(VerifierSupport &VS, unsigned AS,
+ const Value *V) {
+ if (!VS.TT.isAMDGPU())
+ return;
+
+ // "VGPR as memory" is per-lane register storage, so an atomic access to it is
+ // meaningless and unsupported.
+ Check(AS != AMDGPUAS::VGPR,
+ "atomic operations on the VGPR address space (13) are not allowed", V);
}
bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
@@ -139,6 +167,21 @@ bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
void llvm::verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
CallBase &Call) {
+ // No intrinsic models "VGPR as memory" (only plain load/store is supported),
+ // so an addrspace(13) pointer argument - to a memory intrinsic, masked
+ // load/store, gather/scatter, ptrmask, etc. - would be mishandled.
+ if (VS.TT.isAMDGPU())
+ for (const Value *Op : Call.args()) {
+ Type *T = Op->getType();
+ if (T->isPtrOrPtrVectorTy() &&
+ T->getPointerAddressSpace() == AMDGPUAS::VGPR) {
+ VS.CheckFailed("intrinsic with a VGPR address space (13) pointer "
+ "argument is not allowed",
+ &Call);
+ break;
+ }
+ }
+
switch (ID) {
default:
return;
diff --git a/llvm/lib/IR/VerifierInternal.h b/llvm/lib/IR/VerifierInternal.h
index 70d1521475198..94ac405f10885 100644
--- a/llvm/lib/IR/VerifierInternal.h
+++ b/llvm/lib/IR/VerifierInternal.h
@@ -221,6 +221,10 @@ void verifyAMDGPUFunctionMetadata(VerifierSupport &VS, const Function &F);
void verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI);
+void verifyAMDGPUGlobalVariable(VerifierSupport &VS, const GlobalVariable &GV);
+
+void verifyAMDGPUAtomicAccess(VerifierSupport &VS, unsigned AS, const Value *V);
+
void verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
CallBase &Call);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index d19333f14ee63..2f3ae2ad7ecbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
+FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
@@ -276,18 +276,15 @@ struct AMDGPUPromoteAllocaPass
TargetMachine &TM;
};
-void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+void initializeAMDGPUPrivateObjectVGPRsLegacyPass(PassRegistry &);
extern char &AMDGPUPrivateObjectVGPRsID;
-// Allocates pre-existing VGPR address space allocas without performing any
-// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
-// objects remain functional.
-struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
- AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ModulePass *createAMDGPULowerModuleVGPRsPass();
+void initializeAMDGPULowerModuleVGPRsPass(PassRegistry &);
+extern char &AMDGPULowerModuleVGPRsID;
-private:
- TargetMachine &TM;
+struct AMDGPULowerModuleVGPRsPass : PassInfoMixin<AMDGPULowerModuleVGPRsPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
struct AMDGPUPromoteAllocaToVectorPass
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e289058a2ed1..7330f3b13f3cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,10 +21,8 @@
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -343,159 +341,25 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
-// Resolve the constant byte offset within the per-function VGPR file for a
-// "VGPR as memory" access whose (legalized) address is \p Ptr. Returns
-// std::nullopt if \p Ptr is not a constant offset from a VGPR-as-memory frame
-// object.
-static std::optional<unsigned>
-getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
- unsigned ExtraOffset = 0;
- if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
- if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
- ExtraOffset = C->getZExtValue();
- Ptr = Ptr.getOperand(0);
- }
- }
- auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
- if (!FI)
- return std::nullopt;
- const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
- if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
- return std::nullopt;
- return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
-}
-
-// Lower a load/store of a "VGPR as memory" object into one
-// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
-// offset. The pseudos are later expanded into subregister copies by
-// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
-// split into their dword lanes; sub-dword and non-dword-multiple accesses are
-// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
-// true if \p N was rewritten.
-bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
- if (auto *Load = dyn_cast<LoadSDNode>(N)) {
- if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
- Load->getExtensionType() != ISD::NON_EXTLOAD)
- return false;
- EVT VT = Load->getValueType(0);
- unsigned Bits = VT.getFixedSizeInBits();
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- std::optional<unsigned> Offset =
- getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
- if (!Offset || (*Offset % 4 != 0))
- return false;
-
- SDLoc DL(N);
- unsigned NumDwords = Bits / 32;
- SmallVector<SDValue, 4> Dwords;
- SmallVector<SDValue, 4> Chains;
- for (unsigned I = 0; I != NumDwords; ++I) {
- SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
- Load->getChain()};
- MachineSDNode *Lane = CurDAG->getMachineNode(
- AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
- if (I == 0)
- CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
- Dwords.push_back(SDValue(Lane, 0));
- Chains.push_back(SDValue(Lane, 1));
- }
-
- SDValue Val;
- if (NumDwords == 1) {
- Val = Dwords[0];
- if (VT != MVT::i32)
- Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
- } else {
- EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
- SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
- Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
- }
- SDValue Chain = NumDwords == 1 ? Chains[0]
- : CurDAG->getNode(ISD::TokenFactor, DL,
- MVT::Other, Chains);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
- return true;
- }
-
- if (auto *Store = dyn_cast<StoreSDNode>(N)) {
- if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
- Store->isTruncatingStore())
- return false;
- SDValue Val = Store->getValue();
- EVT VT = Val.getValueType();
- unsigned Bits = VT.getFixedSizeInBits();
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- std::optional<unsigned> Offset =
- getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
- if (!Offset || (*Offset % 4 != 0))
- return false;
-
- SDLoc DL(N);
- unsigned NumDwords = Bits / 32;
- SmallVector<SDValue, 4> Dwords;
- if (NumDwords == 1) {
- if (VT != MVT::i32)
- Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
- Dwords.push_back(Val);
- } else {
- EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
- SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
- for (unsigned I = 0; I != NumDwords; ++I)
- Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
- Vec,
- CurDAG->getConstant(I, DL, MVT::i32)));
- }
-
- SmallVector<SDValue, 4> Chains;
- for (unsigned I = 0; I != NumDwords; ++I) {
- SDValue Ops[] = {Dwords[I],
- CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
- Store->getChain()};
- MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
- DL, MVT::Other, Ops);
- if (I == 0)
- CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
- Chains.push_back(SDValue(Lane, 0));
- }
- SDValue Chain = NumDwords == 1 ? Chains[0]
- : CurDAG->getNode(ISD::TokenFactor, DL,
- MVT::Other, Chains);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
- return true;
- }
-
- return false;
-}
-
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- bool MadeChange = false;
+ if (!Subtarget->d16PreservesUnusedBits())
+ return;
- // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
- // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
- SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
- while (VGPRPos != CurDAG->allnodes_begin()) {
- SDNode *N = &*--VGPRPos;
- MadeChange |= rewriteVGPRFrameAccess(N);
- }
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
- if (Subtarget->d16PreservesUnusedBits()) {
- SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
- while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = &*--Position;
- if (N->use_empty())
- continue;
-
- switch (N->getOpcode()) {
- case ISD::BUILD_VECTOR:
- // TODO: Match load d16 from shl (extload:i16), 16
- MadeChange |= matchLoadD16FromBuildVector(N);
- break;
- default:
- break;
- }
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // TODO: Match load d16 from shl (extload:i16), 16
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index cf62874912742..95f85a6151375 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,7 +67,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool runOnMachineFunction(MachineFunction &MF) override;
bool matchLoadD16FromBuildVector(SDNode *N) const;
- bool rewriteVGPRFrameAccess(SDNode *N);
void PreprocessISelDAG() override;
void Select(SDNode *N) override;
void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
new file mode 100644
index 0000000000000..25df6c9283bd3
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
@@ -0,0 +1,315 @@
+//===- AMDGPULowerModuleVGPRs.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lays out a module's "VGPR as memory" (addrspace(13)) globals into a shared
+// register "file" per independent call-graph group and records where it lives
+// on every function in that group.
+//
+// The file is backed by a fixed block of physical VGPRs, so for an address into
+// it to be meaningful across calls every function in the call graph must agree
+// on (a) each global's byte offset and (b) the file's base register. The
+// backend can derive a base per function (just above its ABI inputs) but those
+// differ, so (b) is resolved once per group:
+//
+//   * Offsets: a group's globals are packed into one deterministic layout; each
+//     global's byte offset is recorded as "amdgpu.vgpr.memory.offset" metadata.
+//   * Base: one index, the max ABI-input VGPR boundary over the group's
+//     functions, so it clears every function's inputs yet stays as low as
+//     possible to preserve occupancy.
+//
+// The file size and base are attached as the "amdgpu-vgpr-memory-size" and
+// "amdgpu-vgpr-memory-base" attributes to every function whose call graph uses
+// the file (like LDS, it is live for a using kernel's whole execution, so all
+// reachable functions must reserve it). The backend consumes these:
+// SIISelLowering reads the offset metadata; SIMachineFunctionInfo reads the
+// attributes; SIRegisterInfo::getVGPRMemoryFile reserves [base, base + size).
+//
+// TODO: one layout per group makes every member function reserve all of the
+// group's globals, and a function reachable from several kernels reserves the
+// file even for a kernel that does not use it. A per-kernel layout (as
+// AMDGPULowerModuleLDS does, with a table for shared callees) would tighten it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-module-vgprs"
+
+namespace {
+
+constexpr char SizeAttr[] = "amdgpu-vgpr-memory-size";
+constexpr char BaseAttr[] = "amdgpu-vgpr-memory-base";
+constexpr char OffsetMD[] = "amdgpu.vgpr.memory.offset";
+
+// The fixed device-function ABI keeps the work-item ID in this register
+// (SITargetLowering::allocateSpecialInputVGPRsFixed). The shared file must not
+// overlap it.
+constexpr unsigned FixedWorkitemRegIdx = 31;
+
+// True if F may read the work-item ID (and so needs its work-item-ID input
+// register), per the amdgpu-no-workitem-id-* attributes.
+static bool usesWorkitemID(const Function &F) {
+ return !F.hasFnAttribute("amdgpu-no-workitem-id-x") ||
+ !F.hasFnAttribute("amdgpu-no-workitem-id-y") ||
+ !F.hasFnAttribute("amdgpu-no-workitem-id-z");
+}
+
+// Upper bound on the low contiguous VGPRs occupied by F's ABI inputs - the
+// registers the shared file must sit above. The fixed device-function ABI also
+// keeps the work-item ID in the high register v31 (see usesFixedWorkitemReg);
+// that is modelled separately, not counted here.
+static unsigned inputVGPRBound(const Function &F) {
+ // Compute kernels take args in the kernarg segment, not VGPRs; their only
+ // VGPR input is the work-item ID, packed into a single low register.
+ if (AMDGPU::isKernel(F.getCallingConv()))
+ return usesWorkitemID(F) ? 1 : 0;
+
+ // Graphics entry points and ordinary functions pass their arguments in VGPRs
+ // (except inreg arguments, which go in SGPRs).
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned N = 0;
+ for (const Argument &A : F.args()) {
+ if (A.hasAttribute(Attribute::InReg))
+ continue;
+ unsigned Dwords =
+ divideCeil(DL.getTypeAllocSize(A.getType()).getFixedValue(), 4u);
+ // A multi-dword argument tuple is even-aligned on targets that require
+ // aligned VGPR tuples. Model that gap conservatively so the shared base
+ // never lands below such an argument register (the backend's overlap check
+ // in getVGPRMemoryFile is the backstop if this is ever too low).
+ if (Dwords > 1)
+ N = alignTo(N, 2u);
+ N += Dwords;
+ }
+ return N;
+}
+
+// True if F is a callable (non-entry) device function on the default ABI, which
+// keeps the work-item ID in the fixed high register v31. The shared file must
+// not overlap v31 in such a function.
+static bool usesFixedWorkitemReg(const Function &F) {
+ CallingConv::ID CC = F.getCallingConv();
+ return !AMDGPU::isEntryFunctionCC(CC) && !AMDGPU::isGraphics(CC) &&
+ usesWorkitemID(F);
+}
+
+class AMDGPULowerModuleVGPRs : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerModuleVGPRs() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override { return "AMDGPU Lower Module VGPRs"; }
+};
+
+} // end anonymous namespace
+
+char AMDGPULowerModuleVGPRs::ID = 0;
+char &llvm::AMDGPULowerModuleVGPRsID = AMDGPULowerModuleVGPRs::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleVGPRs, DEBUG_TYPE, "AMDGPU Lower Module VGPRs",
+ false, false)
+
+ModulePass *llvm::createAMDGPULowerModuleVGPRsPass() {
+ return new AMDGPULowerModuleVGPRs();
+}
+
+// Lays out the module's addrspace(13) globals; returns true iff M was modified.
+static bool lowerModuleVGPRs(Module &M) {
+  SmallVector<GlobalVariable *, 8> Globals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::VGPR)
+      Globals.push_back(&GV);
+  if (Globals.empty())
+    return false;
+
+  // One walk per defined function: record the addrspace(13) globals it directly
+  // references and its ordinary calls (non-intrinsic, non-inline-asm), which
+  // feed the reserved-register-clobber check below.
+  DenseMap<Function *, SmallVector<GlobalVariable *, 2>> Uses;
+  DenseMap<Function *, SmallVector<const CallBase *, 2>> Calls;
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    SmallPtrSet<GlobalVariable *, 4> Seen;
+    for (Instruction &I : instructions(F)) {
+      if (const auto *CB = dyn_cast<CallBase>(&I))
+        if (!CB->isInlineAsm() &&
+            CB->getIntrinsicID() == Intrinsic::not_intrinsic)
+          Calls[&F].push_back(CB);
+      for (Value *Op : I.operands()) {
+        // Only pointer operands can name a global; skipping the rest avoids a
+        // getUnderlyingObject call per non-pointer operand on every compile.
+        if (!Op->getType()->isPtrOrPtrVectorTy())
+          continue;
+        // getUnderlyingObject sees through constant-expression GEPs/casts, so
+        // a global referenced via e.g. getelementptr(@g, off) is found.
+        auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Op));
+        if (GV && GV->getAddressSpace() == AMDGPUAS::VGPR &&
+            Seen.insert(GV).second)
+          Uses[&F].push_back(GV);
+      }
+    }
+  }
+  if (Uses.empty())
+    return false; // no function references the file; the module is unchanged
+
+  CallGraph CG(M);
+  auto Reachable = [&](Function *Root, SmallPtrSetImpl<Function *> &Out) {
+    SmallVector<Function *, 16> Work{Root};
+    while (!Work.empty()) {
+      Function *F = Work.pop_back_val();
+      if (!Out.insert(F).second)
+        continue;
+      if (CallGraphNode *N = CG[F])
+        for (auto &CR : *N)
+          if (Function *Callee = CR.second->getFunction())
+            if (!Callee->isDeclaration())
+              Work.push_back(Callee);
+    }
+  };
+
+  // Partition functions and globals into independent layout groups: a group
+  // covers everything reachable from a using kernel (the file is live for its
+  // whole execution, like LDS) plus every function that uses each global. So
+  // disjoint kernels get independent (low, occupancy-friendly) bases while
+  // shared functions stay in one group. Functions and globals are both
+  // GlobalValues, so one union-find covers both.
+  EquivalenceClasses<const GlobalValue *> Groups;
+  for (auto &[F, GVs] : Uses)
+    for (GlobalVariable *GV : GVs)
+      Groups.unionSets(F, GV);
+
+  // Functions reachable from each file-using kernel join that kernel's group
+  // (so they reserve the file), and kernels sharing any callee merge.
+  for (Function &K : M) {
+    if (K.isDeclaration() || !AMDGPU::isEntryFunctionCC(K.getCallingConv()))
+      continue;
+    SmallPtrSet<Function *, 16> R;
+    Reachable(&K, R);
+    if (llvm::none_of(R, [&](Function *F) { return Uses.count(F); }))
+      continue; // this kernel does not use the file
+    for (Function *F : R)
+      Groups.unionSets(&K, F);
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  LLVMContext &Ctx = M.getContext();
+  Type *I32 = Type::getInt32Ty(Ctx);
+
+  // Lay out each group independently.
+  for (auto It = Groups.begin(), E = Groups.end(); It != E; ++It) {
+    const auto *Leader = *It;
+    if (!Leader->isLeader())
+      continue;
+    SmallVector<GlobalVariable *, 8> GroupGlobals;
+    SmallVector<Function *, 16> GroupFns;
+    for (auto MI = Groups.member_begin(*Leader); MI != Groups.member_end();
+         ++MI) {
+      const GlobalValue *GV = *MI;
+      if (auto *G = dyn_cast<GlobalVariable>(GV))
+        GroupGlobals.push_back(const_cast<GlobalVariable *>(G));
+      else
+        GroupFns.push_back(const_cast<Function *>(cast<Function>(GV)));
+    }
+    if (GroupGlobals.empty() || GroupFns.empty())
+      continue;
+
+    // Deterministic packed layout (sorted by name).
+    llvm::stable_sort(GroupGlobals, [](GlobalVariable *A, GlobalVariable *B) {
+      return A->getName() < B->getName();
+    });
+    unsigned Size = 0;
+    for (GlobalVariable *GV : GroupGlobals) {
+      Align A = std::max(
+          DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()),
+          Align(4));
+      unsigned Offset = alignTo(Size, A);
+      GV->setMetadata(OffsetMD,
+                      MDNode::get(Ctx, {ConstantAsMetadata::get(
+                                           ConstantInt::get(I32, Offset))}));
+      Size = Offset + DL.getTypeAllocSize(GV->getValueType()).getFixedValue();
+    }
+
+    // One base for the group: above every member's low ABI-input VGPRs,
+    // even-aligned.
+    unsigned Base = 0;
+    bool ClearsFixedWorkitem = false;
+    for (Function *F : GroupFns) {
+      Base = std::max(Base, inputVGPRBound(*F));
+      ClearsFixedWorkitem |= usesFixedWorkitemReg(*F);
+    }
+    Base = alignTo(Base, 2u);
+
+    // The fixed device-function ABI keeps the work-item ID in v31. A small file
+    // sits below it; if the file would grow into v31, place it above instead
+    // (at an occupancy cost) so it never overlaps that input.
+    unsigned Dwords = AMDGPU::getVGPRMemoryFileDwords(Size);
+    if (ClearsFixedWorkitem && Base <= FixedWorkitemRegIdx &&
+        Base + Dwords > FixedWorkitemRegIdx)
+      Base = alignTo(FixedWorkitemRegIdx + 1, 2u);
+
+    // The file lives in low, caller-saved VGPRs that only group members
+    // reserve. A call to anything outside the group - indirect, external, or a
+    // defined non-member - does not reserve the file and clobbers it, so
+    // diagnose rather than silently corrupt it. (Direct calls between members
+    // are safe; intrinsics don't clobber.) Calls introduced after this pass
+    // (e.g. expanded libcalls) and inline asm clobbering a file register are
+    // caught later, in AMDGPUPrivateObjectVGPRs, where the machine-level calls
+    // and the final reserved registers are known.
+    SmallPtrSet<const Function *, 16> GroupFnSet(GroupFns.begin(),
+                                                 GroupFns.end());
+    for (Function *F : GroupFns)
+      for (const CallBase *CB : Calls.lookup(F)) {
+        const Function *Callee = CB->getCalledFunction();
+        if (!Callee || !GroupFnSet.contains(Callee))
+          Ctx.diagnose(DiagnosticInfoUnsupported(
+              *F,
+              "'VGPR as memory' is not supported in a function that makes an "
+              "indirect call or a call outside its call graph",
+              CB->getDebugLoc()));
+      }
+
+    for (Function *F : GroupFns) {
+      F->addFnAttr(SizeAttr, utostr(Size));
+      F->addFnAttr(BaseAttr, utostr(Base));
+    }
+  }
+  return true;
+}
+
+bool AMDGPULowerModuleVGPRs::runOnModule(Module &M) {
+ return lowerModuleVGPRs(M);
+}
+
+PreservedAnalyses AMDGPULowerModuleVGPRsPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ return lowerModuleVGPRs(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 376a1ebcc4256..38f53b6365207 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -23,6 +23,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-lower-module-vgprs", AMDGPULowerModuleVGPRsPass())
MODULE_PASS("amdgpu-perf-hint",
AMDGPUPerfHintAnalysisPass(
*static_cast<const GCNTargetMachine *>(this)))
@@ -67,7 +68,6 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
-FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
FUNCTION_PASS("amdgpu-promote-kernel-arguments",
AMDGPUPromoteKernelArgumentsPass())
FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
@@ -126,6 +126,7 @@ MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass())
MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass())
MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass())
MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass())
+MACHINE_FUNCTION_PASS("amdgpu-private-object-vgprs", AMDGPUPrivateObjectVGPRsPass())
MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass())
MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass())
MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index a3a1cf6f18bed..8e88cf60e3dbf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -7,30 +7,32 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
-/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
-/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
-/// carries a constant byte offset, which selects the dword (subregister) to
-/// copy.
+/// Lowers the constant-index SI_VGPR_FRAME_{LOAD,STORE} pseudos for "VGPR as
+/// memory" objects (addrspace(13)) into register copies to/from the block of
+/// physical VGPRs backing the file: a load is a COPY from the file register, a
+/// store a COPY to it.
///
-/// This runs once the function is out of SSA form (so the single backing tuple
-/// can be defined by several subregister copies) and while LiveIntervals is
-/// available. The backing tuple has lane-divergent liveness (its subregisters
-/// are written and read independently), which the whole-register LiveVariables
-/// analysis cannot represent; the pass therefore updates the subregister-aware
-/// LiveIntervals directly.
+/// The file is a fixed block of VGPRs (SIRegisterInfo::getVGPRMemoryFile)
+/// reserved out of allocation (getReservedRegs) and counted in the VGPR usage
+/// (AMDGPUResourceUsageAnalysis). It sits just above the ABI inputs at a base
+/// AMDGPULowerModuleVGPRs shares across the call graph (so an address resolves
+/// to the same registers everywhere), low enough to cost only its own size
+/// rather than pinning occupancy. This pass runs after register allocation;
+/// until then the pseudos behave as opaque memory operations, so allocation is
+/// free to use any other register for the surrounding code.
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUPrivateObjectVGPRs.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
using namespace llvm;
@@ -38,13 +40,66 @@ using namespace llvm;
namespace {
-class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+// These two switches must list the same widths as the
+// SI_VGPR_FRAME_{LOAD,STORE}_B* `foreach` in SIInstructions.td.
+static bool isVGPRFrameLoad(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B32:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B64:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B96:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B128:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B160:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B192:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B224:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B256:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B288:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B320:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B352:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B384:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B512:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B1024:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isVGPRFrameStore(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::SI_VGPR_FRAME_STORE_B32:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B64:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B96:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B128:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B160:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B192:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B224:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B256:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B288:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B320:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B352:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B384:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B512:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B1024:
+ return true;
+ default:
+ return false;
+ }
+}
+
+class AMDGPUPrivateObjectVGPRs {
+public:
+ bool run(MachineFunction &MF);
+};
+
+class AMDGPUPrivateObjectVGPRsLegacy : public MachineFunctionPass {
public:
static char ID;
- AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+ AMDGPUPrivateObjectVGPRsLegacy() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &MF) override;
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ return AMDGPUPrivateObjectVGPRs().run(MF);
+ }
StringRef getPassName() const override {
return "AMDGPU Private Object VGPRs";
@@ -52,94 +107,146 @@ class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<LiveIntervalsWrapperPass>();
- AU.addPreserved<LiveIntervalsWrapperPass>();
- AU.addPreserved<SlotIndexesWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // end anonymous namespace
-INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRsLegacy, DEBUG_TYPE,
"AMDGPU Private Object VGPRs", false, false)
-char AMDGPUPrivateObjectVGPRs::ID = 0;
+char AMDGPUPrivateObjectVGPRsLegacy::ID = 0;
-char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRsLegacy::ID;
-bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+PreservedAnalyses
+AMDGPUPrivateObjectVGPRsPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (!AMDGPUPrivateObjectVGPRs().run(MF))
+ return PreservedAnalyses::all();
+ return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
+}
+
+bool AMDGPUPrivateObjectVGPRs::run(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- // Collect the pseudos and determine how many dwords the backing tuple needs.
- SmallVector<MachineInstr *, 8> Worklist;
- unsigned NumDwords = 0;
+ // The file is a fixed block of reserved physical VGPRs (getVGPRMemoryFile):
+ // exempt from liveness, needing no explicit def, and at the same (shared)
+ // registers across the call graph.
+ auto [BaseIdx, FileDwords] = TRI->getVGPRMemoryFile(MF);
+ if (FileDwords == 0)
+ return false;
+
+ const TargetRegisterClass &VGPR32 = AMDGPU::VGPR_32RegClass;
+
+ // The file lives in low, caller-saved VGPRs. AMDGPULowerModuleVGPRs diagnoses
+ // calls that escape the group at the IR level, but later passes (e.g.
+ // AtomicExpand, CodeGenPrepare) can introduce libcalls, and inline asm naming
+ // a file register is not seen there at all. Both would clobber the file, so
+ // catch them here, now that the reserved registers and machine calls are
+ // final.
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ auto FileOverlaps = [&](Register Reg) {
+ for (unsigned I = 0; I != FileDwords; ++I)
+ if (TRI->regsOverlap(Reg, VGPR32.getRegister(BaseIdx + I)))
+ return true;
+ return false;
+ };
+ auto RegMaskClobbersFile = [&](const MachineOperand &MO) {
+ for (unsigned I = 0; I != FileDwords; ++I)
+ if (MO.clobbersPhysReg(VGPR32.getRegister(BaseIdx + I)))
+ return true;
+ return false;
+ };
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- unsigned Opc = MI.getOpcode();
- if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
- Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+ if (MI.isInlineAsm()) {
+ // A clobber surfaces either as a physical-register def (explicit or implicit)
+ // or, for some forms, as a register-mask operand; check both.
+ for (const MachineOperand &MO : MI.operands())
+ if ((MO.isReg() && MO.getReg().isPhysical() && MO.isDef() &&
+ FileOverlaps(MO.getReg())) ||
+ (MO.isRegMask() && RegMaskClobbersFile(MO))) {
+ Ctx.diagnose(DiagnosticInfoUnsupported(
+ MF.getFunction(),
+ "inline asm clobbers a 'VGPR as memory' reserved register",
+ MI.getDebugLoc()));
+ break;
+ }
continue;
- unsigned ByteOffset = MI.getOperand(1).getImm();
- NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
- Worklist.push_back(&MI);
+ }
+ // A call clobbers caller-saved VGPRs, including the file, unless the
+ // callee reserves the same file: an in-group member (which carries the
+ // size attribute) or this function itself (self-recursion). Anything else
+ // - an out-of-group/external callee, or an indirect call with no
+ // resolvable callee - does not preserve it. AMDGPULowerModuleVGPRs catches
+ // IR-level escapes; this also covers calls introduced after it (e.g.
+ // expanded libcalls) and indirect machine calls it could not see.
+ if (MI.isCall()) {
+ const MachineOperand *CalleeOp =
+ TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+ const auto *Callee =
+ CalleeOp && CalleeOp->isGlobal()
+ ? dyn_cast<Function>(
+ CalleeOp->getGlobal()->stripPointerCastsAndAliases())
+ : nullptr;
+ if (Callee == &MF.getFunction() ||
+ (Callee && Callee->hasFnAttribute("amdgpu-vgpr-memory-size")))
+ continue;
+ Ctx.diagnose(DiagnosticInfoUnsupported(
+ MF.getFunction(),
+ "call to a function that clobbers the 'VGPR as memory' reserved "
+ "file",
+ MI.getDebugLoc()));
+ }
}
}
- if (Worklist.empty())
- return false;
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+ unsigned Opc = MI.getOpcode();
+ bool IsLoad = isVGPRFrameLoad(Opc);
+ if (!IsLoad && !isVGPRFrameStore(Opc))
+ continue;
- LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-
- const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
- assert(RC && "no VGPR register class for VGPR-as-memory object");
- Register Storage = MRI.createVirtualRegister(RC);
-
- // Define the whole tuple up front so partial (subregister) writes and reads
- // of uninitialized lanes are well formed.
- MachineBasicBlock &Entry = MF.front();
- MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
- TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
- LIS->InsertMachineInstrInMaps(*ImpDef);
-
- for (MachineInstr *MI : Worklist) {
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
- unsigned Dword = MI->getOperand(1).getImm() / 4;
- unsigned SubReg = NumDwords == 1
- ? AMDGPU::NoSubRegister
- : SIRegisterInfo::getSubRegFromChannel(Dword);
-
- MachineInstr *Copy;
- if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
- Register Dst = MI->getOperand(0).getReg();
- Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
- .addReg(Storage, {}, SubReg);
- } else {
- Register Src = MI->getOperand(0).getReg();
- Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
- .addReg(Storage, RegState::Define, SubReg)
- .addReg(Src);
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Dword = MI.getOperand(1).getImm();
+ Register Data = MI.getOperand(0).getReg();
+ unsigned AccessDwords = TRI->getRegSizeInBits(Data, MRI) / 32;
+
+ // Bounds-checked at pseudo creation (LowerLoadStoreVGPR); never name a
+ // register outside the reserved file.
+ assert(Dword + AccessDwords <= FileDwords &&
+ "VGPR-as-memory access outside the reserved file");
+
+ // Copy the access dword-by-dword between the data (sub)registers and the
+ // file registers. Doing it per dword rather than as one tuple COPY avoids
+ // needing an aligned physical VGPR tuple for the file slice, which can
+ // start on an odd register on targets that require aligned tuples.
+ for (unsigned I = 0; I != AccessDwords; ++I) {
+ MCRegister FileReg = VGPR32.getRegister(BaseIdx + Dword + I);
+ Register DataReg =
+ AccessDwords == 1
+ ? Data
+ : Register(TRI->getSubReg(
+ Data, SIRegisterInfo::getSubRegFromChannel(I)));
+ if (IsLoad)
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), DataReg)
+ .addReg(FileReg);
+ else
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), FileReg)
+ .addReg(DataReg);
+ }
+
+ MI.eraseFromParent();
+ Changed = true;
}
- // The copy takes the pseudo's slot, so the intervals of the copied
- // load/store operand stay valid.
- LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
- MI->eraseFromParent();
}
- // The backing tuple is brand new; compute its (subregister) live interval.
- LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
-
- // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
- // form disconnected value-number components within the single tuple, which an
- // individual live interval must not contain. Split them into separate
- // virtual registers, exactly as the register coalescer does for the intervals
- // it leaves behind.
- SmallVector<LiveInterval *, 4> SplitLIs;
- LIS->splitSeparateComponents(LI, SplitLIs);
-
- return true;
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
new file mode 100644
index 0000000000000..f18b9a608970d
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.h
@@ -0,0 +1,23 @@
+//===- AMDGPUPrivateObjectVGPRs.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+class AMDGPUPrivateObjectVGPRsPass
+ : public PassInfoMixin<AMDGPUPrivateObjectVGPRsPass> {
+public:
+ PreservedAnalyses run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+};
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPRIVATEOBJECTVGPRS_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index c587302c3bbae..2223b9d036fa1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,7 +35,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -139,7 +138,6 @@ class AMDGPUPromoteAllocaImpl {
unsigned MaxVGPRs;
unsigned VGPRBudgetRatio;
unsigned MaxVectorRegs;
- unsigned AllocVGPROffset = 0;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -164,10 +162,6 @@ class AMDGPUPromoteAllocaImpl {
void analyzePromoteToVector(AllocaAnalysis &AA) const;
void promoteAllocaToVector(AllocaAnalysis &AA);
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
-
- /// Allocate an alloca that already lives in the VGPR address space to a range
- /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
- void allocateVgprs(AllocaAnalysis &AA);
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
SetVector<IntrinsicInst *> &DeferredIntrs);
void
@@ -185,11 +179,7 @@ class AMDGPUPromoteAllocaImpl {
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
}
- /// IsLatePass is true when invoked as a codegen pass and false when invoked
- /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
- /// requests only the work strictly required for functionality (i.e. VGPR
- /// allocation), skipping the optimization-oriented promotions.
- bool run(Function &F, bool IsLatePass, bool NoOpt);
+ bool run(Function &F, bool PromoteToLDS);
};
// FIXME: This can create globals so should be a module pass.
@@ -197,34 +187,26 @@ class AMDGPUPromoteAlloca : public FunctionPass {
public:
static char ID;
- explicit AMDGPUPromoteAlloca(
- CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
- : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(), *F.getParent(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
- .run(F, /*IsLatePass=*/true, NoOpt);
- }
+ .run(F, /*PromoteToLDS=*/true);
return false;
}
- StringRef getPassName() const override {
- return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
- }
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
-
-private:
- bool NoOpt;
};
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -269,7 +251,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
- .run(F, /*IsLatePass=*/true, /*NoOpt=*/false);
+ .run(F, /*PromoteToLDS=*/true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -282,20 +264,7 @@ PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
- .run(F, /*IsLatePass=*/false, /*NoOpt=*/false);
- if (Changed) {
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
- }
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, *F.getParent(), LI)
- .run(F, /*IsLatePass=*/true, /*NoOpt=*/true);
+ .run(F, /*PromoteToLDS=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -304,8 +273,8 @@ PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
return PreservedAnalyses::all();
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
- return new AMDGPUPromoteAlloca(OptLevel);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
}
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -398,121 +367,14 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
-// A "VGPR as memory" object can only be realized in registers today when every
-// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
-// bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses need gfx13 support, which is not yet
-// available; such objects fall back to scratch instead.
-//
-// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
-// support so this fallback is no longer needed.
-static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
- const DataLayout &DL) {
- // An access is lowerable if it covers a whole number of dwords and starts at
- // a dword-aligned constant offset from the alloca.
- auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
- if (!Simple)
- return false;
- uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
- const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
- DL, Off, /*AllowNonInbounds=*/true);
- return Base == &AI && Off.urem(4) == 0;
- };
-
- SmallVector<const Use *, 16> Worklist;
- for (const Use &U : AI.uses())
- Worklist.push_back(&U);
-
- while (!Worklist.empty()) {
- const Use *U = Worklist.pop_back_val();
- const User *Usr = U->getUser();
-
- if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
- if (!GEP->hasAllConstantIndices())
- return false;
- for (const Use &GU : GEP->uses())
- Worklist.push_back(&GU);
- continue;
- }
- if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
- if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
- return false;
- continue;
- }
- if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
- // The pointer must be the address operand, not a stored value (escape).
- if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
- return false;
- if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
- SI->isSimple()))
- return false;
- continue;
- }
- // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
- // otherwise not statically lowerable.
- return false;
- }
- return true;
-}
-
-// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
-// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
-// falls back to ordinary scratch.
-static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
- SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
- for (Use *U : Uses) {
- User *Usr = U->getUser();
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
- IRBuilder<> B(GEP);
- SmallVector<Value *, 4> Indices(GEP->indices());
- Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
- GEP->getName(), GEP->getNoWrapFlags());
- rewriteVGPRPointerToScratch(GEP, NewGEP);
- GEP->eraseFromParent();
- continue;
- }
- if (auto *II = dyn_cast<IntrinsicInst>(Usr);
- II && II->isLifetimeStartOrEnd()) {
- II->eraseFromParent();
- continue;
- }
- // Loads, stores, address-space casts and call arguments only need this
- // operand repointed; their result types do not depend on the operand's
- // address space.
- U->set(New);
- }
-}
-
-static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
- auto *NewAI = new AllocaInst(
- AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
- AI->getAlign(), AI->getName(), AI->getIterator());
- NewAI->setDebugLoc(AI->getDebugLoc());
- rewriteVGPRPointerToScratch(AI, NewAI);
- AI->eraseFromParent();
-}
-
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
- assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
- if (!IsLatePass && DisablePromoteAllocaToVector)
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
+ if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
return false;
- bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
- bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
-
bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
MaxVGPRs = IsAMDGCN ? getMaxVGPRs(CurrentLocalMemUsage, TM, F) : 128;
setFunctionLimits(F);
- // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
- // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- const bool TargetSupportsVGPRAsMemory =
- ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
-
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
@@ -529,18 +391,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
AllocaAnalysis AA{AI};
- if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
- // Allocas that already live in the VGPR address space only need to be
- // assigned VGPRs, which is required for functionality.
- if (IsLatePass)
- Allocas.push_back(std::move(AA));
- continue;
- }
- if (!PromoteToVector && !PromoteToLDS)
- continue;
if (collectAllocaUses(AA)) {
- if (PromoteToVector)
- analyzePromoteToVector(AA);
+ analyzePromoteToVector(AA);
if (PromoteToLDS)
analyzePromoteToLDS(AA);
if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -551,15 +403,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
}
}
- stable_sort(Allocas, [](const auto &A, const auto &B) {
- // Prioritize pre-existing VGPR allocas, since their allocation must not
- // fail.
- bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
- bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
- if (AIsVGPR != BIsVGPR)
- return AIsVGPR;
- return A.Score > B.Score;
- });
+ stable_sort(Allocas,
+ [](const auto &A, const auto &B) { return A.Score > B.Score; });
// clang-format off
LLVM_DEBUG(
@@ -572,39 +417,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
bool Changed = false;
SetVector<IntrinsicInst *> DeferredIntrs;
for (AllocaAnalysis &AA : Allocas) {
- if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
- // Fall back to scratch (and warn) when the object can't be kept in
- // registers, so the program still compiles correctly: either the target
- // does not support "VGPR as memory", or the access pattern (dynamic
- // index, sub-dword, escaping address) is not yet supported.
- const char *Unsupported = nullptr;
- if (!TargetSupportsVGPRAsMemory)
- Unsupported = "not supported on this target";
- else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
- Unsupported = "dynamic indexing, sub-dword access, or escaping address "
- "is not yet supported";
- if (Unsupported) {
- F.getContext().diagnose(DiagnosticInfoUnsupported(
- F,
- Twine("'amdgpu_vgpr' object could not be kept in vector registers "
- "(") +
- Unsupported + "); using scratch memory instead",
- AA.Alloca->getDebugLoc(), DS_Warning));
- demoteVGPRAllocaToScratch(AA.Alloca);
- Changed = true;
- continue;
- }
- const unsigned AllocaCost =
- AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
- allocateVgprs(AA);
- // Account for the consumed VGPRs in the vectorization budget.
- if (VectorizationBudget > AllocaCost)
- VectorizationBudget -= AllocaCost;
- else
- VectorizationBudget = 0;
- Changed = true;
- continue;
- }
if (AA.Vector.Ty) {
std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(DL);
assert(Size); // Expected to succeed on non-array alloca.
@@ -639,21 +451,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
return Changed;
}
-void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
- LLVMContext &Ctx = Mod->getContext();
- const unsigned AllocaSize =
- DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()) / 8;
-
- // Record where the object was allocated within the VGPR file.
- Type *I32 = Type::getInt32Ty(Ctx);
- AA.Alloca->setMetadata(
- "amdgpu.allocated.vgprs",
- MDNode::get(
- Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
- ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
- AllocVGPROffset += alignTo(AllocaSize, 4);
-}
-
// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index aab43f23cf606..f41474a2bc031 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -176,12 +176,24 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass,
/*IncludeCalls=*/false);
+ // Reserved "VGPR as memory" file registers aren't "used" but must still be
+ // allocated, so the VGPR count has to cover the highest one.
+ std::pair<unsigned, unsigned> VGPRMemFile = TRI.getVGPRMemoryFile(MF);
+ unsigned VGPRMemBase = VGPRMemFile.first;
+ unsigned VGPRMemCount = VGPRMemFile.second;
+ auto AccountForVGPRMemoryFile = [&](int32_t NumVGPR) -> int32_t {
+ if (VGPRMemCount)
+ NumVGPR = std::max<int32_t>(NumVGPR, VGPRMemBase + VGPRMemCount);
+ return NumVGPR;
+ };
+
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
/*IncludeCalls=*/false);
+ Info.NumVGPR = AccountForVGPRMemoryFile(Info.NumVGPR);
return Info;
}
@@ -319,7 +331,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
}
}
- Info.NumVGPR = MaxVGPR + 1;
+ Info.NumVGPR = AccountForVGPRMemoryFile(MaxVGPR + 1);
return Info;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7fc233be91fe0..d6d0c36721fdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -31,6 +31,7 @@
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPUPrepareAGPRAlloc.h"
+#include "AMDGPUPrivateObjectVGPRs.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUResourceUsageAnalysis.h"
@@ -668,7 +669,8 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
- initializeAMDGPUPrivateObjectVGPRsPass(*PR);
+ initializeAMDGPUPrivateObjectVGPRsLegacyPass(*PR);
+ initializeAMDGPULowerModuleVGPRsPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1492,6 +1494,11 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}
+ // Lay out "VGPR as memory" (addrspace(13)) globals into one shared register
+ // file and record its size/base on the participating functions, so it
+ // resolves to the same registers across a kernel's call graph.
+ addPass(createAMDGPULowerModuleVGPRsPass());
+
// Run atomic optimizer before Atomic Expand
if ((TM.getTargetTriple().isAMDGCN()) &&
(TM.getOptLevel() >= CodeGenOptLevel::Less) &&
@@ -1501,12 +1508,9 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAtomicExpandLegacyPass());
- // With optimizations enabled, do the full promotion of allocas. Without
- // optimizations, this only allocates pre-existing VGPR address space allocas,
- // which is required for functionality.
- addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ addPass(createAMDGPUPromoteAlloca());
+
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
@@ -1721,11 +1725,6 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
- // Lower "VGPR as memory" accesses to register copies once out of SSA form.
- // At O0 there is no register coalescer; anchor on TwoAddress, where
- // LiveIntervals is already available.
- insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
-
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
TargetPassConfig::addFastRegAlloc();
@@ -1752,12 +1751,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
- // Lower "VGPR as memory" accesses to register copies once out of SSA form.
- // This runs after the coalescer so it does not perturb the kill flags that
- // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
- // the LiveIntervals the register allocator consumes next.
- insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
-
if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
@@ -1909,6 +1902,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
}
void GCNPassConfig::addPostRegAlloc() {
+ // Lower "VGPR as memory" accesses into copies to/from the reserved VGPR file.
+ // Runs after register allocation (so the file's reserved registers are final)
+ // and before memory-aware post-RA passes (so the pseudos are no longer seen
+ // as memory operations).
+ addPass(&AMDGPUPrivateObjectVGPRsID);
addPass(&SIFixVGPRCopiesID);
if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIOptimizeExecMaskingLegacyID);
@@ -2290,6 +2288,10 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
if (EnableLowerModuleLDS)
addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
+ // Lay out "VGPR as memory" (addrspace(13)) globals into a shared register
+ // file (see the legacy pipeline above for details).
+ addModulePass(AMDGPULowerModuleVGPRsPass(), PMW);
+
// Run atomic optimizer before Atomic Expand
if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
@@ -2298,15 +2300,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
addFunctionPass(AtomicExpandPass(TM), PMW);
- // With optimizations enabled, do the full promotion of allocas. Without
- // optimizations, only allocate pre-existing VGPR address space allocas, which
- // is required for functionality.
- if (TM.getOptLevel() > CodeGenOptLevel::None)
- addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
- else
- addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses(PMW);
@@ -2619,6 +2614,9 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
}
void AMDGPUCodeGenPassBuilder::addPostRegAlloc(PassManagerWrapper &PMW) const {
+ // Lower "VGPR as memory" accesses into copies to/from the reserved VGPR file
+ // (see the legacy GCNPassConfig::addPostRegAlloc for ordering rationale).
+ addMachineFunctionPass(AMDGPUPrivateObjectVGPRsPass(), PMW);
addMachineFunctionPass(SIFixVGPRCopiesPass(), PMW);
if (TM.getOptLevel() > CodeGenOptLevel::None)
addMachineFunctionPass(SIOptimizeExecMaskingPass(), PMW);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd25ab71997d7..3ca9f5bcc9f9d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPULowerModuleVGPRs.cpp
AMDGPUPrepareAGPRAlloc.cpp
AMDGPULowerExecSync.cpp
AMDGPUSwLowerLDS.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index edb31d77fa510..5c768bcb54d6e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4228,6 +4228,22 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return true;
}
+bool SITargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+  // "VGPR as memory" (addrspace(13)) pointers are only lowered by SelectionDAG
+  // so far. Route any instruction whose result or any of whose operands is
+  // such a pointer away from GlobalISel. TODO: implement the GlobalISel path.
+  const auto HasVGPRPointerType = [](const Value *V) {
+    const Type *ValTy = V->getType();
+    if (!ValTy->isPointerTy())
+      return false;
+    return ValTy->getPointerAddressSpace() == AMDGPUAS::VGPR;
+  };
+
+  // The instruction's own (result) type is checked first, then each operand.
+  return HasVGPRPointerType(&Inst) ||
+         llvm::any_of(Inst.operands(), HasVGPRPointerType);
+}
+
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
@@ -5229,11 +5245,16 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CondReg = MRI.createVirtualRegister(BoolRC);
- BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
- .addReg(InitReg)
- .addMBB(&OrigBB)
- .addReg(ResultReg)
- .addMBB(&LoopBB);
+ // A zero PhiReg means the caller threads no per-iteration result value
+ // through the loop (e.g. a store whose destination is a fixed physical
+ // register), so the result PHI - and its requirement that ResultReg be
+ // live-out of the loop - is omitted.
+ if (PhiReg)
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(InitReg)
+ .addMBB(&OrigBB)
+ .addReg(ResultReg)
+ .addMBB(&LoopBB);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
.addReg(InitSaveExecReg)
@@ -5595,6 +5616,118 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
+// Expand a runtime-index "VGPR as memory" access into an indirect movrel /
+// s_set_gpr_idx read/write of the reserved file (a waterfall loop if the index
+// is divergent), reusing the indirect-vector-element machinery.
+//
+// MI is treated as a load when its opcode is SI_VGPR_FRAME_DYN_LOAD_B32 and as
+// a store otherwise. Its named operand idx supplies the index register; a
+// store additionally reads the named operand vdata. MI is erased on every
+// path. Returns the block that now holds the access: MBB itself when the index
+// register is in an SGPR class, otherwise the loop block created by
+// loadM0FromVGPR.
+static MachineBasicBlock *emitVGPRFrameDynamic(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const GCNSubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const bool IsLoad = MI.getOpcode() == AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32;
+
+ // BaseIdx is used below as an index into VGPR_32; Count is used as the
+ // file's size in dwords (Count * 32 bits selects the tuple class).
+ auto [BaseIdx, Count] = TRI.getVGPRMemoryFile(*MF);
+ assert(Count && "dynamic VGPR-memory access without a reserved file");
+ const TargetRegisterClass *VecRC = TRI.getVGPRClassForBitWidth(Count * 32);
+ assert(VecRC && "dynamic VGPR-memory file has no tuple class; "
+ "LowerLoadStoreVGPR rejects this before creating the pseudo");
+ unsigned VecBits = TRI.getRegSizeInBits(*VecRC);
+ // movrel reads name the base sub-register directly (a subregister index is
+ // not allowed on a physical-register operand), with the whole file tuple as
+ // an implicit use.
+ // FileReg is the VecRC tuple whose sub0 is VGPR[BaseIdx]; FileBaseReg is that
+ // first 32-bit register on its own.
+ MCRegister FileReg = TRI.getMatchingSuperReg(
+ AMDGPU::VGPR_32RegClass.getRegister(BaseIdx), AMDGPU::sub0, VecRC);
+ MCRegister FileBaseReg = AMDGPU::VGPR_32RegClass.getRegister(BaseIdx);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+ const bool UseGPRIdxMode = ST.useVGPRIndexMode();
+
+ // Index is file-relative (the constant part was folded in at ISel): base
+ // sub0, no extra offset.
+ unsigned SubReg = AMDGPU::sub0;
+ int Offset = 0;
+
+ // Emit the indexed read/write at InsPt: GPR-idx mode uses IdxReg, movrel mode
+ // uses the preset m0 (IdxReg then unused).
+ auto EmitAccess = [&](MachineBasicBlock &BB,
+ MachineBasicBlock::iterator InsPt, Register IdxReg) {
+ if (IsLoad) {
+ // Load: the result goes to MI's first operand (the pseudo's vdst).
+ Register Dst = MI.getOperand(0).getReg();
+ if (UseGPRIdxMode)
+ BuildMI(BB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, true), Dst)
+ .addReg(FileReg)
+ .addReg(IdxReg)
+ .addImm(SubReg);
+ else
+ BuildMI(BB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(FileBaseReg)
+ .addReg(FileReg, RegState::Implicit);
+ } else {
+ // Store: the file tuple is both the destination and the first source of
+ // the indexed write, so it is updated in place.
+ const MachineOperand *Val =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (UseGPRIdxMode)
+ BuildMI(BB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, false),
+ FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addReg(IdxReg)
+ .addImm(SubReg);
+ else
+ BuildMI(BB, InsPt, DL,
+ TII->getIndirectRegWriteMovRelPseudo(VecBits, 32, false),
+ FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addImm(SubReg);
+ }
+ };
+
+ MachineBasicBlock::iterator I(&MI);
+
+ // Uniform (scalar) index: set up the index in place and emit the access.
+ if (TRI.isSGPRClass(IdxRC)) {
+ // IdxReg stays invalid in movrel mode, where the index goes through m0.
+ Register IdxReg;
+ if (UseGPRIdxMode)
+ IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+ else
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+ EmitAccess(MBB, I, IdxReg);
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ // Divergent (per-lane) index: a waterfall loop covers the lanes sharing each
+ // index. The file is in fixed (reserved) physical registers, so unlike
+ // indirect vector access it is not threaded through a PHI - the per-lane
+ // access reads/writes it in place under EXEC - and a stored value must stay
+ // live across the back-edge.
+ if (!IsLoad)
+ MRI.clearKillFlags(
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg());
+
+ // A load threads its result through the loop; a store threads nothing
+ // (PhiReg == 0 skips the result PHI).
+ Register PhiReg, InitReg;
+ if (IsLoad) {
+ PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+ }
+
+ // loadM0FromVGPR returns the insertion point for the access; its parent is
+ // the loop block. SGPRIdxReg is passed by name and forwarded to EmitAccess.
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+ EmitAccess(*LoopBB, InsPt, SGPRIdxReg);
+
+ MI.eraseFromParent();
+ return LoopBB;
+}
+
static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
MachineBasicBlock *BB) {
// For targets older than GFX12, we emit a sequence of 32-bit operations.
@@ -7122,6 +7255,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::SI_INDIRECT_DST_V16:
case AMDGPU::SI_INDIRECT_DST_V32:
return emitIndirectDst(MI, *BB, *getSubtarget());
+ case AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32:
+ case AMDGPU::SI_VGPR_FRAME_DYN_STORE_B32:
+ return emitVGPRFrameDynamic(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
case AMDGPU::SI_KILL_I1_PSEUDO:
return splitKillBlock(MI, BB);
@@ -9925,6 +10061,14 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
+// Byte offset of an addrspace(13) global in its file, read from operand 0 of
+// the "amdgpu.vgpr.memory.offset" metadata (attached by
+// AMDGPULowerModuleVGPRs).
+//
+// Returns nullopt if the global was not laid out (no such metadata), and also
+// if the node is malformed - no operands, or operand 0 is not a constant
+// integer - so that hand-written or stale IR reaches the callers' "missing
+// layout metadata" diagnostic instead of asserting (or reading out of bounds
+// in a release build) inside mdconst::extract.
+static std::optional<uint64_t> getVGPRMemoryOffset(const GlobalVariable *GV) {
+  const MDNode *MD = GV->getMetadata("amdgpu.vgpr.memory.offset");
+  if (!MD || MD->getNumOperands() == 0)
+    return std::nullopt;
+  if (const auto *Offset =
+          mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(0)))
+    return Offset->getZExtValue();
+  return std::nullopt;
+}
+
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
SDValue Op,
SelectionDAG &DAG) const {
@@ -9933,6 +10077,30 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
EVT PtrVT = Op.getValueType();
const GlobalValue *GV = GSD->getGlobal();
+
+ // A "VGPR as memory" (addrspace(13)) global has no numeric address; its
+ // "address" is the object's byte offset in the file. Lower it to that
+ // constant so even a standalone materialization (e.g. a constexpr GEP) never
+ // reaches the pc-relative global-address sequence.
+ if (GSD->getAddressSpace() == AMDGPUAS::VGPR) {
+ // The object (resolving aliases) must be a global variable laid out by
+ // AMDGPULowerModuleVGPRs. Diagnose a missing layout here too: this fold runs
+ // before LowerLoadStoreVGPR's own check, so a folded constant base would
+ // otherwise bypass it and silently resolve to offset 0.
+ const auto *GVar = dyn_cast<GlobalVariable>(GV->getAliaseeObject());
+ std::optional<uint64_t> MDOffset =
+ GVar ? getVGPRMemoryOffset(GVar) : std::nullopt;
+ if (!MDOffset) {
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ DAG.getMachineFunction().getFunction(),
+ "unsupported 'VGPR as memory' access: missing "
+ "amdgpu.vgpr.memory.offset layout metadata",
+ DL.getDebugLoc()));
+ return DAG.getPOISON(PtrVT);
+ }
+ return DAG.getConstant(GSD->getOffset() + *MDOffset, DL, PtrVT);
+ }
+
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
@@ -14332,6 +14500,248 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
+/// Lower a load/store of a "VGPR as memory" object (a global in AMDGPUAS::VGPR)
+/// into an AMDGPUISD::REG_{LOAD,STORE} node carrying the dword index of the
+/// access within the reserved VGPR file. A constant index selects the
+/// SI_VGPR_FRAME_* pseudos (rewritten to register copies by
+/// AMDGPUPrivateObjectVGPRs); a runtime index selects the SI_VGPR_FRAME_DYN_*
+/// pseudos (expanded to an indexed register move). Sub-dword (8/16-bit)
+/// accesses are realized as a read-modify-write of the containing dword; a
+/// non-integer sub-dword value (e.g. half) is moved as its same-sized integer
+/// bit pattern.
+///
+/// An access this routine cannot handle (e.g. a wider-than-dword dynamic
+/// access, or a base with no layout metadata) is diagnosed and replaced with
+/// poison (load) / its incoming chain (store), so it never reaches instruction
+/// selection as an unselectable memory operation.
+///
+/// \p Op must be an ISD::LOAD or ISD::STORE node. \returns the replacement:
+/// a chain for a store, or a {value, chain} pair for a load.
+SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  MemSDNode *MemOp = cast<MemSDNode>(Op);
+  SDLoc DL(Op);
+
+  // Emit the diagnosis described above (poison for a load, the incoming chain
+  // for a store).
+  auto Unsupported = [&](const Twine &Reason) -> SDValue {
+    DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+        DAG.getMachineFunction().getFunction(),
+        "unsupported 'VGPR as memory' access: " + Reason, DL.getDebugLoc()));
+    if (isa<StoreSDNode>(MemOp))
+      return MemOp->getChain();
+    return DAG.getMergeValues(
+        {DAG.getPOISON(MemOp->getValueType(0)), MemOp->getChain()}, DL);
+  };
+
+  // The pointer is a byte offset into the file. After stripping a folded GEP
+  // offset, the base is the addrspace(13) global (offset in metadata), the
+  // constant LowerGlobalAddress folds it to, or a runtime value (dynamic).
+  SDValue Ptr = MemOp->getBasePtr();
+  // Accumulate the byte offset in 64 bits: the addrspace(13) pointer is only
+  // 32-bit, so a folded constant such as a negative GEP index would otherwise
+  // wrap and defeat the out-of-range check below.
+  uint64_t ExtraOffset = 0;
+  SDValue DynByteOffset; // non-constant byte offset, for a runtime index
+  if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1)))
+      ExtraOffset = C->getZExtValue();
+    else
+      DynByteOffset = Ptr.getOperand(1);
+    Ptr = Ptr.getOperand(0);
+  }
+
+  uint64_t ByteOffset = ExtraOffset;
+  if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
+    if (GA->getAddressSpace() != AMDGPUAS::VGPR)
+      return Unsupported(
+          "base is a global outside the VGPR address space (13)");
+    const auto *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+    if (!GV)
+      return Unsupported(
+          "base is not a VGPR address space (13) global variable");
+    std::optional<uint64_t> MDOffset = getVGPRMemoryOffset(GV);
+    if (!MDOffset)
+      return Unsupported("missing amdgpu.vgpr.memory.offset layout metadata");
+    // GA->getOffset() is signed; a negative value wraps ByteOffset towards
+    // 2^64, which the (overflow-safe) range check below rejects.
+    ByteOffset += *MDOffset + GA->getOffset();
+  } else if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
+    ByteOffset += C->getZExtValue();
+  } else {
+    if (DynByteOffset)
+      return Unsupported("two independent dynamic address terms");
+    DynByteOffset = Ptr; // the base is itself a runtime byte offset
+  }
+  EVT MemVT = MemOp->getMemoryVT();
+  unsigned BitWidth = MemVT.getSizeInBits();
+  MachineFunction &MFn = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *MFI = MFn.getInfo<SIMachineFunctionInfo>();
+  unsigned FileBytes = MFI->getVGPRMemorySize();
+  SDValue Chain = MemOp->getChain();
+
+  // Memory operand describing the containing dword, used by every access that
+  // is rewritten to operate on a whole i32.
+  auto GetDwordMMO = [&](MachineMemOperand::Flags F) {
+    return MFn.getMachineMemOperand(MemOp->getPointerInfo(), F, /*Size=*/4,
+                                    Align(4));
+  };
+
+  // Lower a sub-dword (8/16-bit) access at dword Index, with the field starting
+  // at bit BitInDword, as a read-modify-write (store) or extract (load) of the
+  // containing dword. Index and BitInDword may be constants - which fold, so
+  // this serves both the constant- and runtime-index paths.
+  auto EmitSubDword = [&](SDValue Index, SDValue BitInDword) -> SDValue {
+    auto *StoreOp = dyn_cast<StoreSDNode>(MemOp);
+    EVT ValVT = StoreOp ? StoreOp->getValue().getValueType()
+                        : cast<LoadSDNode>(MemOp)->getValueType(0);
+
+    // The shift/mask sequence below is integer-only, and zext/trunc are not
+    // valid on FP or vector types. A non-integer value (half, bfloat,
+    // <2 x i8>, ...) is therefore moved as the integer of the same width. That
+    // reinterpretation is only a plain bit copy when the register and memory
+    // types have the same size, so a truncating store / extending load of a
+    // non-integer type (which would imply a real conversion) is rejected.
+    const bool NeedsCast = !ValVT.isScalarInteger();
+    if (NeedsCast) {
+      if (StoreOp && StoreOp->isTruncatingStore())
+        return Unsupported("truncating non-integer sub-dword store");
+      if (!StoreOp &&
+          cast<LoadSDNode>(MemOp)->getExtensionType() != ISD::NON_EXTLOAD)
+        return Unsupported("extending non-integer sub-dword load");
+    }
+    EVT FieldVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
+
+    SDValue LowMaskC =
+        DAG.getConstant(maskTrailingOnes<uint32_t>(BitWidth), DL, MVT::i32);
+    SDValue Old = DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+        {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+    if (StoreOp) {
+      SDValue Val = StoreOp->getValue();
+      if (NeedsCast)
+        Val = DAG.getBitcast(FieldVT, Val);
+      Val = DAG.getZExtOrTrunc(Val, DL, MVT::i32);
+      Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val, LowMaskC);
+      Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val, BitInDword);
+      SDValue MaskShifted =
+          DAG.getNode(ISD::SHL, DL, MVT::i32, LowMaskC, BitInDword);
+      SDValue Cleared = DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+                                    DAG.getNOT(DL, MaskShifted, MVT::i32));
+      SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+      // Chain the store after the load of the old dword.
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Old.getValue(1), New, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    bool IsSExt = LoadOp->getExtensionType() == ISD::SEXTLOAD;
+    SDValue Field = DAG.getNode(ISD::SRL, DL, MVT::i32, Old, BitInDword);
+    if (IsSExt)
+      Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+                          DAG.getValueType(MemVT));
+    else
+      Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field, LowMaskC);
+    // Narrow/extend i32 Field to the result type per the load's extension kind;
+    // a non-integer result is narrowed to the field width and bitcast.
+    EVT ResVT = LoadOp->getValueType(0);
+    SDValue Result;
+    if (NeedsCast)
+      Result = DAG.getBitcast(ResVT, DAG.getZExtOrTrunc(Field, DL, FieldVT));
+    else
+      Result = IsSExt ? DAG.getSExtOrTrunc(Field, DL, ResVT)
+                      : DAG.getZExtOrTrunc(Field, DL, ResVT);
+    return DAG.getMergeValues({Result, Old.getValue(1)}, DL);
+  };
+
+  // Runtime index. Sub-dword (8/16-bit) accesses RMW the containing dword
+  // (race-free: VGPRs are per-lane).
+  if (DynByteOffset) {
+    if (BitWidth != 8 && BitWidth != 16 && BitWidth != 32)
+      return Unsupported("dynamic index wider than 32 bits");
+    if (!FileBytes)
+      return Unsupported("dynamic access to an empty VGPR-memory file");
+    // The dynamic index move treats the whole file as one indexed tuple, so the
+    // file's (even-dword-rounded) size must have a VGPR tuple class.
+    unsigned FileDwords = divideCeil(FileBytes, 4u);
+    if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(
+            AMDGPU::getVGPRMemoryFileDwords(FileBytes) * 32))
+      return Unsupported("VGPR-memory file too large for a dynamic index");
+    // The address is a 32-bit addrspace(13) pointer, so the byte offset is
+    // computed in i32: any wrap is the defined behavior of that pointer width,
+    // and the UMIN clamp below bounds the resulting dword index into the file
+    // regardless. (The constant-index path uses 64-bit arithmetic instead,
+    // because it must statically range-check rather than clamp.) The constant
+    // part is truncated explicitly so a 64-bit accumulated value never reaches
+    // getConstant with bits that do not fit an i32.
+    SDValue DynI32 = DAG.getZExtOrTrunc(DynByteOffset, DL, MVT::i32);
+    SDValue Bytes = DAG.getNode(
+        ISD::ADD, DL, MVT::i32, DynI32,
+        DAG.getConstant(static_cast<uint32_t>(ByteOffset), DL, MVT::i32));
+    SDValue Index = DAG.getNode(ISD::SRL, DL, MVT::i32, Bytes,
+                                DAG.getConstant(2, DL, MVT::i32));
+
+    // Clamp the dword index into the file so an out-of-range dynamic access
+    // disturbs only the file's own last register, not arbitrary live VGPRs.
+    Index = DAG.getNode(ISD::UMIN, DL, MVT::i32, Index,
+                        DAG.getConstant(FileDwords - 1, DL, MVT::i32));
+
+    if (BitWidth == 8 || BitWidth == 16) {
+      // The RMW assumes the field stays within one dword, which holds only for
+      // a naturally aligned access; an underaligned one could cross a boundary
+      // at runtime, so reject it rather than silently drop the high bits.
+      if (MemOp->getAlign() < Align(BitWidth / 8))
+        return Unsupported("underaligned sub-dword dynamic access");
+
+      SDValue ByteInDword = DAG.getNode(ISD::AND, DL, MVT::i32, Bytes,
+                                        DAG.getConstant(3, DL, MVT::i32));
+      SDValue BitInDword = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteInDword,
+                                       DAG.getConstant(3, DL, MVT::i32));
+      return EmitSubDword(Index, BitInDword);
+    }
+
+    // Whole-dword dynamic access: both the constant and runtime parts must be
+    // dword-aligned so the index shift does not silently round down.
+    if (ByteOffset % 4 != 0 || MemOp->getAlign() < Align(4))
+      return Unsupported("misaligned 32-bit dynamic access");
+    if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+      // A truncating store's value is wider than 32 bits, so the bitcast to
+      // i32 below would be ill-formed; mirror the extending-load check.
+      if (StoreOp->isTruncatingStore())
+        return Unsupported("truncating 32-bit dynamic store");
+      SDValue Val = DAG.getBitcast(MVT::i32, StoreOp->getValue());
+      return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+                                     DAG.getVTList(MVT::Other),
+                                     {Chain, Val, Index}, MVT::i32,
+                                     GetDwordMMO(MachineMemOperand::MOStore));
+    }
+    auto *LoadOp = cast<LoadSDNode>(MemOp);
+    if (LoadOp->getExtensionType() != ISD::NON_EXTLOAD)
+      return Unsupported("extending 32-bit dynamic load");
+    SDValue Ld = DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+        {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+    EVT ResVT = LoadOp->getValueType(0);
+    SDValue Res = ResVT == MVT::i32 ? Ld : DAG.getBitcast(ResVT, Ld);
+    return DAG.getMergeValues({Res, Ld.getValue(1)}, DL);
+  }
+
+  // A statically out-of-range constant index would select physical registers
+  // outside the reserved file. It is undefined behavior; diagnose it rather
+  // than miscompile into a copy to/from an arbitrary live VGPR. The check is
+  // written as a subtraction so that a ByteOffset close to 2^64 (e.g. from a
+  // negative global-address offset) cannot wrap "offset + size" back into
+  // range.
+  const uint64_t AccessBytes = BitWidth / 8;
+  if (ByteOffset > FileBytes || AccessBytes > FileBytes - ByteOffset)
+    return Unsupported("constant index out of range");
+
+  // Sub-dword (8/16-bit) constant-index access. Registers have no sub-dword
+  // addressing, so extract from (load) or RMW (store) the containing dword.
+  if (BitWidth == 8 || BitWidth == 16) {
+    unsigned BitInDword = (ByteOffset % 4) * 8;
+    if (BitInDword + BitWidth > 32)
+      return Unsupported("sub-dword field crosses a dword boundary");
+    return EmitSubDword(DAG.getConstant(ByteOffset / 4, DL, MVT::i32),
+                        DAG.getConstant(BitInDword, DL, MVT::i32));
+  }
+
+  // Whole-dword accesses.
+  if (ByteOffset % 4 != 0)
+    return Unsupported("misaligned multi-dword access");
+  if (BitWidth == 0 || BitWidth % 32 != 0)
+    return Unsupported("access is not a whole number of dwords");
+  if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
+    return Unsupported("access wider than the largest VGPR tuple");
+
+  if (auto *Load = dyn_cast<LoadSDNode>(MemOp)) {
+    if (Load->getExtensionType() != ISD::NON_EXTLOAD)
+      return Unsupported("extending multi-dword load");
+  } else if (cast<StoreSDNode>(MemOp)->isTruncatingStore()) {
+    return Unsupported("truncating multi-dword store");
+  }
+
+  // View the access as i32 / <N x i32> so one node covers it; bitcast when the
+  // memory type is not register legal.
+  EVT RegVT = MemVT;
+  if (!isTypeLegal(RegVT)) {
+    unsigned NumDwords = BitWidth / 32;
+    RegVT = NumDwords == 1
+                ? EVT(MVT::i32)
+                : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumDwords);
+  }
+
+  // The range check above bounds ByteOffset by the 32-bit FileBytes, so the
+  // dword index fits an i32 constant.
+  SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+  if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+    SDValue Value = StoreOp->getValue();
+    if (RegVT != MemVT)
+      Value = DAG.getNode(ISD::BITCAST, DL, RegVT, Value);
+    return DAG.getMemIntrinsicNode(
+        AMDGPUISD::REG_STORE, DL, DAG.getVTList(MVT::Other),
+        {Chain, Value, Index}, MemVT, StoreOp->getMemOperand());
+  }
+
+  SDValue NewLoad = DAG.getMemIntrinsicNode(
+      AMDGPUISD::REG_LOAD, DL, DAG.getVTList(RegVT, MVT::Other), {Chain, Index},
+      MemVT, MemOp->getMemOperand());
+  if (RegVT == MemVT)
+    return NewLoad;
+  SDValue Value = DAG.getNode(ISD::BITCAST, DL, MemVT, NewLoad);
+  return DAG.getMergeValues({Value, NewLoad.getValue(1)}, DL);
+}
+
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
@@ -18569,6 +18979,19 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ // Lower "VGPR as memory" (addrspace(13)) accesses into AMDGPUISD::REG_*. This
+ // is mandatory lowering, but it is done here rather than in LowerOperation
+ // because it must apply to a load/store of *any* value type (including legal
+ // scalars like i32, which are never custom-lowered), and the address space
+ // cannot be expressed in setOperationAction. It is scoped to addrspace(13)
+ // nodes (so ordinary memory is untouched) and runs first in PerformDAGCombine
+ // and replaces the node, so no other combine preempts it.
+ unsigned Opc = N->getOpcode();
+ if ((Opc == ISD::LOAD || Opc == ISD::STORE) &&
+ cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::VGPR)
+ if (SDValue V = LowerLoadStoreVGPR(SDValue(N, 0), DCI.DAG))
+ return V;
+
switch (N->getOpcode()) {
case ISD::ABS:
if (SDValue Res = promoteUniformUnaryOpToI32(SDValue(N, 0), DCI))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c98426cdac0b1..aa1b11e3c4c68 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -125,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLoadStoreVGPR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const;
@@ -453,6 +454,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool mayBeEmittedAsTailCall(const CallInst *) const override;
+ bool fallBackToDAGISel(const Instruction &Inst) const override;
+
bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8c30e53e9b4e4..35303c881955c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -59,6 +59,17 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.G
// modifier behavior with dx10_enable.
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
+// "VGPR as memory" (addrspace(13)) load/store carrying a dword index into the
+// reserved VGPR file; selected into the SI_VGPR_FRAME_* pseudos.
+//
+// Type profiles (constraint numbers count results first, then operands):
+//   SDTRegIdxLoad  - 1 result (the loaded data), 1 operand; #1 is the i32
+//                    dword index.
+//   SDTRegIdxStore - no result, 2 operands; #0 is the stored data (type left
+//                    unconstrained here), #1 is the i32 dword index.
+// Both SDNodes are chained and carry a memory operand; the load is marked
+// may-load and the store may-store.
+def SDTRegIdxLoad : SDTypeProfile<1, 1,
+ [SDTCisVT<1, i32>]>; // dword_index
+def SDTRegIdxStore : SDTypeProfile<0, 2,
+ [SDTCisVT<1, i32>]>; // data, dword_index
+def SIreg_load : SDNode<"AMDGPUISD::REG_LOAD", SDTRegIdxLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def SIreg_store : SDNode<"AMDGPUISD::REG_STORE", SDTRegIdxStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
def SDTSBufferLoad : SDTypeProfile<1, 3,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3594caef86782..a8877ab258a8e 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,24 +1243,65 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
}
} // End Spill = 1, VALU = 1, isConvergent = 1
-// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
-// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
-// within the per-function VGPR file. They are produced during instruction
-// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
-// pass before register allocation.
+// "VGPR as memory" (addrspace(13)) accesses: load/store of a VGPR tuple at a
+// constant dword index in the reserved file. Selected from AMDGPUISD::REG_*
+// (constant index) and rewritten to copies by AMDGPUPrivateObjectVGPRs.
let hasSideEffects = 0 in {
-def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins i32imm:$offset)> {
- let mayLoad = 1;
- let mayStore = 0;
+foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
+ VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
+ VReg_512, VReg_1024] in {
+ def SI_VGPR_FRAME_LOAD_B#rc.Size : VPseudoInstSI <
+ (outs rc:$vdst), (ins i32imm:$idx)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ }
+ def SI_VGPR_FRAME_STORE_B#rc.Size : VPseudoInstSI <
+ (outs), (ins rc:$vdata, i32imm:$idx)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ }
}
+} // End hasSideEffects = 0
-def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
- (ins VGPR_32:$vdata, i32imm:$offset)> {
- let mayLoad = 0;
- let mayStore = 1;
+// Same, at a *runtime* dword index ($idx, a VS_32). The custom inserter expands
+// these into an indirect read/write of the file (movrel / s_set_gpr_idx, with a
+// waterfall loop for a divergent index). Only 32-bit accesses for now.
+let usesCustomInserter = 1, hasSideEffects = 0, UseNamedOperandTable = 1 in {
+ def SI_VGPR_FRAME_DYN_LOAD_B32 : VPseudoInstSI <
+ (outs VGPR_32:$vdst), (ins VS_32:$idx)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ }
+ def SI_VGPR_FRAME_DYN_STORE_B32 : VPseudoInstSI <
+ (outs), (ins VGPR_32:$vdata, VS_32:$idx)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ }
}
-} // End hasSideEffects = 0
+
+// Constant dword index -> width-matched frame pseudo.
+// For value type vt, the load/store pseudos are looked up by name as
+// SI_VGPR_FRAME_{LOAD,STORE}_B<vt.Size>, and an SIreg_load / SIreg_store whose
+// index operand is an i32 immediate is mapped onto them, passing the immediate
+// through as the pseudo's index operand.
+multiclass VGPRFrameLoadStorePat<ValueType vt> {
+ defvar load_inst = !cast<Instruction>("SI_VGPR_FRAME_LOAD_B"#vt.Size);
+ defvar store_inst = !cast<Instruction>("SI_VGPR_FRAME_STORE_B"#vt.Size);
+ def : GCNPat<(vt (SIreg_load (i32 imm:$idx))), (load_inst imm:$idx)>;
+ def : GCNPat<(SIreg_store vt:$data, (i32 imm:$idx)),
+ (store_inst $data, imm:$idx)>;
+}
+
+// Non-constant dword index -> dynamic pseudo. Lower complexity than the
+// constant patterns, so a constant index still prefers them.
+foreach vt = Reg32Types.types in {
+ def : GCNPat<(vt (SIreg_load i32:$idx)), (SI_VGPR_FRAME_DYN_LOAD_B32 $idx)>;
+ def : GCNPat<(SIreg_store vt:$data, i32:$idx),
+ (SI_VGPR_FRAME_DYN_STORE_B32 $data, $idx)>;
+}
+
+// Instantiate the constant-index patterns for every value type in the listed
+// Reg<N>Types sets (32 through 384 bits, plus 512 and 1024).
+foreach vt = !listconcat(
+ Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
+ Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
+ Reg288Types.types, Reg320Types.types, Reg352Types.types, Reg384Types.types,
+ Reg512Types.types, Reg1024Types.types) in
+defm : VGPRFrameLoadStorePat<vt>;
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 4be4ce28e6de5..13648e813488d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -183,6 +183,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
"amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
+ // "VGPR as memory" file layout from AMDGPULowerModuleVGPRs (~0u base = none).
+ VGPRMemorySize =
+ F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-size", 0);
+ VGPRMemoryBase =
+ F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-base", ~0u);
+
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
// RA, shift it to the lowest available unused VGPR if the one exist.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 1f43505650222..4d799da91c7f0 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -587,6 +587,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
// the serialization easier.
ReservedRegSet WWMReservedRegs;
+ // "VGPR as memory" (addrspace(13)) file assigned by AMDGPULowerModuleVGPRs:
+ // size in bytes and the shared base register index (~0u = none). Reserved out
+ // of allocation for the whole function, like LDS; offsets come from metadata.
+ unsigned VGPRMemorySize = 0;
+ unsigned VGPRMemoryBase = ~0u;
+
bool IsWholeWaveFunction = false;
using PrologEpilogSGPRSpill =
@@ -690,6 +696,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
+ // "VGPR as memory" file size in bytes (0 if none) and shared base register.
+ unsigned getVGPRMemorySize() const { return VGPRMemorySize; }
+ unsigned getVGPRMemoryBase() const { return VGPRMemoryBase; }
+
bool isWWMReservedRegister(Register Reg) const {
return WWMReservedRegs.contains(Reg);
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9700720f0373a..94098c8d863ba 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -584,6 +584,45 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
+/// Return {first VGPR_32 index, dword register count} of the "VGPR as memory"
+/// file of \p MF, or {0, 0} when the recorded file size is zero.
+///
+/// Size and base are read from SIMachineFunctionInfo. A size without a base, a
+/// file that does not fit in the addressable VGPRs, or a file that overlaps a
+/// live-in VGPR is reported as a fatal error.
+std::pair<unsigned, unsigned>
+SIRegisterInfo::getVGPRMemoryFile(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned Bytes = MFI->getVGPRMemorySize();
+  if (!Bytes)
+    return {0, 0};
+
+  // Even number of dwords so wide (>=64-bit) tuple accesses start on an aligned
+  // register on targets that require aligned VGPR tuples.
+  unsigned Dwords = AMDGPU::getVGPRMemoryFileDwords(Bytes);
+
+  // The base is assigned module-wide by AMDGPULowerModuleVGPRs (identical
+  // across the call graph, so an address resolves to the same register
+  // everywhere).
+  unsigned BaseIdx = MFI->getVGPRMemoryBase();
+
+  // Validate at run time instead of asserting: callers index VGPR_32 with
+  // BaseIdx + I, so a bad value must not get through a build without asserts.
+  if (BaseIdx == ~0u)
+    report_fatal_error("VGPR-memory size set without a base");
+
+  // Compare against the limit without forming BaseIdx + Dwords, so the bound
+  // check itself cannot wrap around.
+  unsigned NumVGPRs =
+      ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
+  if (Dwords > NumVGPRs || BaseIdx > NumVGPRs - Dwords)
+    report_fatal_error("VGPR-as-memory file does not fit");
+
+  // The file [BaseIdx, BaseIdx + Dwords) must not overlap any VGPR ABI input.
+  // A small file sits below the work-item-ID register; a larger one is placed
+  // above it by the module pass. Verify no overlap remains rather than risk
+  // silently clobbering an input.
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (const auto &LI : MRI.liveins()) {
+    MCRegister Reg = LI.first;
+    const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+    if (!RC || !isVGPRClass(RC))
+      continue;
+    unsigned Start = getHWRegIndex(Reg);
+    // Round the width up to whole dwords: a class narrower than 32 bits would
+    // otherwise yield an empty range and hide an overlap at Start == BaseIdx.
+    unsigned End = Start + (getRegSizeInBits(*RC) + 31u) / 32u;
+    if (BaseIdx < End && Start < BaseIdx + Dwords)
+      report_fatal_error("VGPR-as-memory file overlaps a VGPR ABI input");
+  }
+
+  return {BaseIdx, Dwords};
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -747,6 +786,13 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (Register Reg : MFI->getWWMReservedRegs())
reserveRegisterTuples(Reserved, Reg);
+ // Reserve the registers backing "VGPR as memory" (addrspace(13)) objects
+ // (see getVGPRMemoryFile).
+ auto [VGPRMemBase, VGPRMemCount] = getVGPRMemoryFile(MF);
+ for (unsigned I = 0; I != VGPRMemCount; ++I)
+ reserveRegisterTuples(Reserved,
+ AMDGPU::VGPR_32RegClass.getRegister(VGPRMemBase + I));
+
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..1d7fcee791af1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -96,6 +96,14 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
+ /// The "VGPR as memory" (addrspace(13)) register file: a fixed block of
+ /// physical VGPRs reserved for the whole function, placed (like LDS) at a
+ /// location consistent across the call graph. Returns the VGPR_32 index of
+ /// the first file register and the dword register count, or {0, 0} if the
+ /// function has no such objects.
+ std::pair<unsigned, unsigned>
+ getVGPRMemoryFile(const MachineFunction &MF) const;
+
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7528cd2a009a3..96571dd028b14 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,7 +18,6 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
@@ -1780,17 +1779,6 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
return false;
}
-AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
- const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
- assert(MD && MD->getNumOperands() == 2 &&
- "expected !amdgpu.allocated.vgprs metadata with 2 operands");
- unsigned Address =
- mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
- unsigned Size =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- return {Address, Size};
-}
-
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 923c5c3a988fd..d74cc3da3593e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -31,7 +31,6 @@ struct amd_kernel_code_t;
namespace llvm {
struct Align;
-class AllocaInst;
class Argument;
class Function;
class GlobalValue;
@@ -50,6 +49,14 @@ namespace AMDGPU {
struct AMDGPUMCKernelCodeT;
struct IsaVersion;
+/// Number of dword registers a "VGPR as memory" (addrspace(13)) file of
+/// \p Bytes bytes occupies: the size rounded up to whole dwords, then rounded
+/// up to an even dword count. Shared by the module layout pass, register
+/// reservation, and instruction selection so they agree on the file size.
+inline unsigned getVGPRMemoryFileDwords(unsigned Bytes) {
+  // Ceil-divide without computing Bytes + 3, which wraps near UINT_MAX and
+  // would report a tiny file for a huge size.
+  unsigned Dwords = Bytes / 4u + (Bytes % 4u != 0u ? 1u : 0u);
+  return (Dwords + 1u) & ~1u; // alignTo(Dwords, 2)
+}
+
/// Generic target versions emitted by this version of LLVM.
///
/// These numbers are incremented every time a codegen breaking change occurs
@@ -1038,16 +1045,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
-/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
-/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
-/// to within the VGPR file, and its size in bytes.
-struct AllocatedVGPRsMetadata {
- unsigned Address;
- unsigned Size;
-
- static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
-};
-
// The following methods are only meaningful on targets that support
// S_WAITCNT.
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..67365cdc38b88 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -273,8 +273,10 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// (address space 7), and 128-bit non-integral buffer resourcees (address
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
+ // Address space 13 ("VGPR as memory") uses 32-bit register-relative indices.
return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
+ "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:"
+ "64-"
"v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
deleted file mode 100644
index f6c64c5121867..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
-
-define void @vgpr_alloca() {
-; CHECK-LABEL: define void @vgpr_alloca(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- ret void
-}
-
-define void @vgpr_alloca_multiple() {
-; CHECK-LABEL: define void @vgpr_alloca_multiple(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
-; CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[B]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca i32, align 4, addrspace(13)
- %b = alloca [2 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- store i32 0, ptr addrspace(13) %b
- ret void
-}
-
-define void @private_alloca_unchanged() {
-; CHECK-LABEL: define void @private_alloca_unchanged(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT: store i64 42, ptr addrspace(5) [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i64], align 4, addrspace(5)
- store i64 42, ptr addrspace(5) %a
- ret void
-}
-
-declare void @use(ptr)
-
-; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
-; back to ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
-; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
-; CHECK-NEXT: store i32 [[V]], ptr addrspace(5) [[P2]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
- store i32 %v, ptr addrspace(13) %p
- ret void
-}
-
-; A VGPR object whose address escapes (here via a cast to a generic pointer, as
-; the frontend emits) cannot be kept in registers yet, so it falls back to
-; ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_escaping() {
-; CHECK-LABEL: define void @vgpr_alloca_escaping(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
-; CHECK-NEXT: call void @use(ptr [[CAST]])
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- %cast = addrspacecast ptr addrspace(13) %a to ptr
- call void @use(ptr %cast)
- ret void
-}
-
-; Whole-dword-multiple accesses (here i64) stay in VGPRs.
-define void @vgpr_alloca_i64(i64 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_i64(
-; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
-; CHECK-NEXT: store i64 [[V]], ptr addrspace(13) [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a = alloca i64, align 8, addrspace(13)
- store i64 %v, ptr addrspace(13) %a
- ret void
-}
-
-; Sub-dword accesses are not supported yet, so the object falls back to scratch.
-define void @vgpr_alloca_subdword(i16 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_subdword(
-; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
-; CHECK-NEXT: store i16 [[V]], ptr addrspace(5) [[A1]], align 2
-; CHECK-NEXT: ret void
-;
- %a = alloca [2 x i16], align 4, addrspace(13)
- store i16 %v, ptr addrspace(13) %a
- ret void
-}
-;.
-; CHECK: [[META0]] = !{i32 0, i32 16}
-; CHECK: [[META1]] = !{i32 0, i32 4}
-; CHECK: [[META2]] = !{i32 4, i32 8}
-; CHECK: [[META3]] = !{i32 0, i32 8}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
deleted file mode 100644
index 63ba44b479279..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
-; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
-; scratch.
-
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-
-define void @vgpr_obj() {
-; SUPP: alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
-; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
- %a = alloca [4 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
deleted file mode 100644
index ea914907a900d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
-
-; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
-; indices must lower to register copies, never to scratch/buffer memory traffic.
-
-; CHECK-LABEL: store_load_i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_i32(i32 %v) {
- %a = alloca i32, align 4, addrspace(13)
- store i32 %v, ptr addrspace(13) %a
- %l = load i32, ptr addrspace(13) %a
- %r = add i32 %l, 1
- ret i32 %r
-}
-
-; CHECK-LABEL: store_load_array:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_array(i32 %v) {
- %a = alloca [4 x i32], align 4, addrspace(13)
- %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
- %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
- store i32 %v, ptr addrspace(13) %p1
- store i32 7, ptr addrspace(13) %p3
- %l1 = load i32, ptr addrspace(13) %p1
- %l3 = load i32, ptr addrspace(13) %p3
- %s = add i32 %l1, %l3
- ret i32 %s
-}
-
-; A 64-bit (two-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_i64:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i64 @store_load_i64(i64 %v) {
- %a = alloca i64, align 8, addrspace(13)
- store i64 %v, ptr addrspace(13) %a
- %l = load i64, ptr addrspace(13) %a
- %r = add i64 %l, 1
- ret i64 %r
-}
-
-; A vector (four-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_v4i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
- %a = alloca <4 x i32>, align 16, addrspace(13)
- store <4 x i32> %v, ptr addrspace(13) %a
- %l = load <4 x i32>, ptr addrspace(13) %a
- ret <4 x i32> %l
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 0dbabd2991bc4..13db81d89e43d 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -27,9 +27,9 @@
; GCN-O0-NEXT: amdgpu-lower-exec-sync
; GCN-O0-NEXT: amdgpu-sw-lower-lds
; GCN-O0-NEXT: amdgpu-lower-module-lds
+; GCN-O0-NEXT: amdgpu-lower-module-vgprs
; GCN-O0-NEXT: function
; GCN-O0-NEXT: atomic-expand
-; GCN-O0-NEXT: amdgpu-vgpr-allocate
; GCN-O0-NEXT: verify
; GCN-O0-NEXT: unreachableblockelim
; GCN-O0-NEXT: ee-instrument<post-inline>
@@ -81,6 +81,7 @@
; GCN-O0-NEXT: si-lower-wwm-copies
; GCN-O0-NEXT: amdgpu-reserve-wwm-regs
; GCN-O0-NEXT: regallocfast<filter=vgpr>
+; GCN-O0-NEXT: amdgpu-private-object-vgprs
; GCN-O0-NEXT: si-fix-vgpr-copies
; GCN-O0-NEXT: remove-redundant-debug-values
; GCN-O0-NEXT: fixup-statepoint-caller-saved
@@ -129,6 +130,7 @@
; GCN-O2-NEXT: amdgpu-lower-exec-sync
; GCN-O2-NEXT: amdgpu-sw-lower-lds
; GCN-O2-NEXT: amdgpu-lower-module-lds
+; GCN-O2-NEXT: amdgpu-lower-module-vgprs
; GCN-O2-NEXT: function
; GCN-O2-NEXT: amdgpu-atomic-optimizer
; GCN-O2-NEXT: atomic-expand
@@ -253,6 +255,7 @@
; GCN-O2-NEXT: stack-slot-coloring
; GCN-O2-NEXT: machine-cp
; GCN-O2-NEXT: machinelicm
+; GCN-O2-NEXT: amdgpu-private-object-vgprs
; GCN-O2-NEXT: si-fix-vgpr-copies
; GCN-O2-NEXT: si-optimize-exec-masking
; GCN-O2-NEXT: remove-redundant-debug-values
@@ -315,6 +318,7 @@
; GCN-O3-NEXT: amdgpu-lower-exec-sync
; GCN-O3-NEXT: amdgpu-sw-lower-lds
; GCN-O3-NEXT: amdgpu-lower-module-lds
+; GCN-O3-NEXT: amdgpu-lower-module-vgprs
; GCN-O3-NEXT: function
; GCN-O3-NEXT: amdgpu-atomic-optimizer
; GCN-O3-NEXT: atomic-expand
@@ -439,6 +443,7 @@
; GCN-O3-NEXT: stack-slot-coloring
; GCN-O3-NEXT: machine-cp
; GCN-O3-NEXT: machinelicm
+; GCN-O3-NEXT: amdgpu-private-object-vgprs
; GCN-O3-NEXT: si-fix-vgpr-copies
; GCN-O3-NEXT: si-optimize-exec-masking
; GCN-O3-NEXT: remove-redundant-debug-values
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index aabfadd33e976..960cbb1a0def2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,15 +47,14 @@
; GCN-O0-NEXT: AMDGPU lowering of execution synchronization
; GCN-O0-NEXT: AMDGPU Software lowering of LDS
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O0-NEXT: AMDGPU Lower Module VGPRs
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand Atomic instructions
-; GCN-O0-NEXT: Dominator Tree Construction
-; GCN-O0-NEXT: Natural Loop Information
-; GCN-O0-NEXT: AMDGPU VGPR Allocate
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
+; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: AMDGPU lower intrinsics
@@ -117,7 +116,6 @@
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
-; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
@@ -132,6 +130,7 @@
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: AMDGPU Reserve WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
+; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
@@ -210,6 +209,7 @@
; GCN-O1-NEXT: AMDGPU lowering of execution synchronization
; GCN-O1-NEXT: AMDGPU Software lowering of LDS
; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O1-NEXT: AMDGPU Lower Module VGPRs
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Cycle Info Analysis
@@ -362,7 +362,6 @@
; GCN-O1-NEXT: Live Interval Analysis
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Register Coalescer
-; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: Rename Disconnected Subregister Components
; GCN-O1-NEXT: Rewrite Partial Register Uses
; GCN-O1-NEXT: Machine Instruction Scheduler
@@ -402,6 +401,7 @@
; GCN-O1-NEXT: Stack Slot Coloring
; GCN-O1-NEXT: Machine Copy Propagation Pass
; GCN-O1-NEXT: Machine Loop Invariant Code Motion
+; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: SI Fix VGPR copies
; GCN-O1-NEXT: SI optimize exec mask operations
; GCN-O1-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -502,6 +502,7 @@
; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization
; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS
; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Module VGPRs
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
@@ -680,7 +681,6 @@
; GCN-O1-OPTS-NEXT: Live Interval Analysis
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Register Coalescer
-; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: Rewrite Partial Register Uses
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
@@ -721,6 +721,7 @@
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
; GCN-O1-OPTS-NEXT: Machine Copy Propagation Pass
; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion
+; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: SI Fix VGPR copies
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations
; GCN-O1-OPTS-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -822,6 +823,7 @@
; GCN-O2-NEXT: AMDGPU lowering of execution synchronization
; GCN-O2-NEXT: AMDGPU Software lowering of LDS
; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O2-NEXT: AMDGPU Lower Module VGPRs
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Cycle Info Analysis
@@ -1003,7 +1005,6 @@
; GCN-O2-NEXT: Live Interval Analysis
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Register Coalescer
-; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: Rewrite Partial Register Uses
; GCN-O2-NEXT: Machine Instruction Scheduler
@@ -1045,6 +1046,7 @@
; GCN-O2-NEXT: Stack Slot Coloring
; GCN-O2-NEXT: Machine Copy Propagation Pass
; GCN-O2-NEXT: Machine Loop Invariant Code Motion
+; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: SI Fix VGPR copies
; GCN-O2-NEXT: SI optimize exec mask operations
; GCN-O2-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -1146,6 +1148,7 @@
; GCN-O3-NEXT: AMDGPU lowering of execution synchronization
; GCN-O3-NEXT: AMDGPU Software lowering of LDS
; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O3-NEXT: AMDGPU Lower Module VGPRs
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Cycle Info Analysis
@@ -1340,7 +1343,6 @@
; GCN-O3-NEXT: Live Interval Analysis
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Register Coalescer
-; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: Rewrite Partial Register Uses
; GCN-O3-NEXT: Machine Instruction Scheduler
@@ -1382,6 +1384,7 @@
; GCN-O3-NEXT: Stack Slot Coloring
; GCN-O3-NEXT: Machine Copy Propagation Pass
; GCN-O3-NEXT: Machine Loop Invariant Code Motion
+; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: SI Fix VGPR copies
; GCN-O3-NEXT: SI optimize exec mask operations
; GCN-O3-NEXT: Remove Redundant DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 79c11fb2a7c37..4f340c6938ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -55,7 +55,7 @@
@nullptr12 = global ptr addrspace(12) addrspacecast (ptr null to ptr addrspace(12))
; CHECK-LABEL: nullptr13:
-; R600-NEXT: .long 0
+; CHECK-NEXT: .long -1
@nullptr13 = global ptr addrspace(13) addrspacecast (ptr null to ptr addrspace(13))
; CHECK-LABEL: nullptr14:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index fc5dabc584863..1a73c35f83f8f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -49,6 +49,7 @@
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: AMDGPU Reserve WWM Registers
; O0-NEXT: Fast Register Allocator
+; O0-NEXT: AMDGPU Private Object VGPRs
; O0-NEXT: SI Fix VGPR copies
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
new file mode 100644
index 0000000000000..50efa3936c365
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-addrspacecast.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; An addrspacecast to or from the "VGPR as memory" address space (13) has no
+; meaningful numeric translation, so it is allowed but lowers to poison rather
+; than being rejected. ptrtoint/inttoptr are also IR-legal; materializing such a
+; pointer on its own (as below) has no defined value, but an actual memory
+; access through an inttoptr value is a real dynamic-indexed access - see
+; @dyn_inttoptr in vgpr-as-memory-dynamic.ll.
+
+@g = internal addrspace(13) global i32 poison
+
+define ptr @cast_to_generic() {
+; CHECK-LABEL: cast_to_generic:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %c = addrspacecast ptr addrspace(13) @g to ptr
+ ret ptr %c
+}
+
+define ptr addrspace(13) @cast_to_vgpr(ptr %p) {
+; CHECK-LABEL: cast_to_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %c = addrspacecast ptr %p to ptr addrspace(13)
+ ret ptr addrspace(13) %c
+}
+
+define ptr addrspace(13) @inttoptr_vgpr(i32 %x) {
+; CHECK-LABEL: inttoptr_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = inttoptr i32 %x to ptr addrspace(13)
+ ret ptr addrspace(13) %p
+}
+
+define i32 @ptrtoint_vgpr(ptr addrspace(13) %p) {
+; CHECK-LABEL: ptrtoint_vgpr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %i = ptrtoint ptr addrspace(13) %p to i32
+ ret i32 %i
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
new file mode 100644
index 0000000000000..4b90e28d51235
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-callgraph.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A device function that uses the work-item ID keeps it in the fixed high
+; register v31, while the shared "VGPR as memory" file sits at a low base. The
+; two must not be confused: the file stays below v31 (it does not have to clear
+; it), and the access still lowers to a register copy.
+
+@g = internal addrspace(13) global i32 poison
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define void @dev() #0 {
+; CHECK-LABEL: dev:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v31
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %id = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %id, ptr addrspace(13) @g
+ ret void
+}
+
+define amdgpu_kernel void @k() {
+; CHECK-LABEL: k:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s12, s8
+; CHECK-NEXT: s_add_u32 s8, s4, 36
+; CHECK-NEXT: s_mov_b32 s13, s9
+; CHECK-NEXT: s_addc_u32 s9, s5, 0
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_add_u32 s4, s4, dev@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s5, s5, dev@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; CHECK-NEXT: s_mov_b32 s14, s10
+; CHECK-NEXT: s_mov_b64 s[10:11], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[4:5], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[2:3]
+; CHECK-NEXT: v_mov_b32_e32 v31, v0
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; CHECK-NEXT: s_endpgm
+ call void @dev()
+ ret void
+}
+
+attributes #0 = { noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
new file mode 100644
index 0000000000000..fb763cd31e339
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O2
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A "VGPR as memory" access through a constant-expression GEP must lower to a
+; register copy, not the pc-relative global-address sequence (which previously
+; crashed because addrspace(13) pointers are 32-bit). Exercised at -O0 too,
+; where the address is materialized standalone rather than folded.
+
+@buf = internal addrspace(13) global [4 x i32] poison
+
+define void @store_constexpr_gep(i32 %v) {
+; O0-LABEL: store_constexpr_gep:
+; O0: ; %bb.0:
+; O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT: v_mov_b32_e32 v4, v0
+; O0-NEXT: s_setpc_b64 s[30:31]
+;
+; O2-LABEL: store_constexpr_gep:
+; O2: ; %bb.0:
+; O2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT: v_mov_b32_e32 v4, v0
+; O2-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+ ret void
+}
+
+define i32 @load_constexpr_gep() {
+; O0-LABEL: load_constexpr_gep:
+; O0: ; %bb.0:
+; O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT: v_mov_b32_e32 v0, v4
+; O0-NEXT: s_setpc_b64 s[30:31]
+;
+; O2-LABEL: load_constexpr_gep:
+; O2: ; %bb.0:
+; O2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT: v_mov_b32_e32 v0, v4
+; O2-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+ ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
new file mode 100644
index 0000000000000..97a58b4d16436
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
@@ -0,0 +1,346 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s -o /dev/null
+
+; A runtime (non-constant) index into a "VGPR as memory" object becomes an
+; indexed move into the reserved VGPR file: s_set_gpr_idx on gfx9, movrel on
+; gfx10+, with a waterfall loop for a divergent index.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+@buf8 = internal addrspace(13) global [16 x i8] poison
+@buf16 = internal addrspace(13) global [16 x i16] poison
+
+define amdgpu_kernel void @dyn_uniform(ptr addrspace(1) %out, i32 %i, i32 %v) {
+; GFX942-LABEL: dyn_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_andn2_b32 s2, s2, -2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, s3
+; GFX942-NEXT: s_min_u32 s2, s2, 15
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: global_store_dword v0, v18, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_and_not1_b32 s2, s2, -2.0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 m0, s2, 15
+; GFX11-NEXT: v_movreld_b32_e32 v2, s3
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %i
+ store i32 %v, ptr addrspace(13) %p
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_load(ptr addrspace(1) %out) {
+; GFX942-LABEL: dyn_divergent_load:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_min_u32_e32 v0, 15, v0
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB1_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v1, v18, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_load:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v1, 15, v1
+; GFX11-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v1
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s3
+; GFX11-NEXT: v_movrels_b32_e32 v18, v2
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v18, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_store(ptr addrspace(1) %out, i32 %v) {
+; GFX942-LABEL: dyn_divergent_store:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_min_u32_e32 v0, 15, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: s_mov_b64 s[0:1], exec
+; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB2_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_store:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v0, 15, v0
+; GFX11-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_movreld_b32_e32 v2, s0
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+; Sub-dword (i8/i16) at a runtime index: the containing dword is read-modify-
+; written with the bit position computed at runtime.
+define amdgpu_kernel void @dyn_i8_uniform(ptr addrspace(1) %out, i32 %i, i8 %v) {
+; GFX942-LABEL: dyn_i8_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_and_b32 s5, s2, 3
+; GFX942-NEXT: s_and_b32 s4, s3, 0xff
+; GFX942-NEXT: s_lshr_b32 s2, s2, 2
+; GFX942-NEXT: s_lshl_b32 s5, s5, 3
+; GFX942-NEXT: s_lshl_b32 s4, s4, s5
+; GFX942-NEXT: s_lshl_b32 s5, 0xff, s5
+; GFX942-NEXT: s_min_u32 s2, s2, 3
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: v_not_b32_e32 v6, s5
+; GFX942-NEXT: v_and_b32_e32 v1, v1, v6
+; GFX942-NEXT: v_or_b32_e32 v1, s4, v1
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_i8_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s4, s2, 3
+; GFX11-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_min_u32 m0, s2, 3
+; GFX11-NEXT: s_lshl_b32 s2, 0xff, s4
+; GFX11-NEXT: v_movrels_b32_e32 v0, v2
+; GFX11-NEXT: v_not_b32_e32 v1, s2
+; GFX11-NEXT: s_and_b32 s2, s3, 0xff
+; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX11-NEXT: global_store_b8 v1, v6, s[0:1]
+; GFX11-NEXT: v_movreld_b32_e32 v2, v0
+; GFX11-NEXT: s_endpgm
+ %p = getelementptr [16 x i8], ptr addrspace(13) @buf8, i32 0, i32 %i
+ store i8 %v, ptr addrspace(13) %p
+ %l = load i8, ptr addrspace(13) %p
+ store i8 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_i16_divergent(ptr addrspace(1) %out, i16 %v) {
+; GFX942-LABEL: dyn_i16_divergent:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 4, v10
+; GFX942-NEXT: s_mov_b32 s2, 0xffff
+; GFX942-NEXT: v_bfe_u32 v0, v0, 1, 9
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_lshlrev_b32_e64 v11, v10, s2
+; GFX942-NEXT: v_min_u32_e32 v0, 7, v0
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB4_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: v_bfi_b32 v11, v11, 0, v12
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_and_b32 s2, s6, 0xffff
+; GFX942-NEXT: v_lshl_or_b32 v10, s2, v10, v11
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v10
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB4_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_i16_divergent:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 4, v1
+; GFX11-NEXT: v_bfe_u32 v1, v0, 1, 9
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_lshlrev_b32_e64 v11, v10, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT: v_min_u32_e32 v1, 7, v1
+; GFX11-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v1
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s4
+; GFX11-NEXT: v_movrels_b32_e32 v12, v2
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v11, v11, 0, v12
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_lshl_or_b32 v10, s3, v10, v11
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v1
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s4
+; GFX11-NEXT: v_movreld_b32_e32 v2, v10
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB4_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i16], ptr addrspace(13) @buf16, i32 0, i32 %tid
+ store i16 %v, ptr addrspace(13) %p
+ %l = load i16, ptr addrspace(13) %p
+ store i16 %l, ptr addrspace(1) %out
+ ret void
+}
+
+; An addrspace(13) pointer built with inttoptr is not poison: the integer is the
+; register-relative byte offset, so the access is lowered as a dynamic index
+; (clamped) like any other runtime index. A direct reference to @buf reserves
+; the file for this function.
+define amdgpu_kernel void @dyn_inttoptr(ptr addrspace(1) %out, i32 %off, i32 %v) {
+; GFX942-LABEL: dyn_inttoptr:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: v_mov_b32_e32 v2, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_lshr_b32 s2, s2, 2
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: s_min_u32 s2, s2, 15
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_inttoptr:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-NEXT: s_min_u32 m0, s2, 15
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_movreld_b32_e32 v2, s3
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ store i32 0, ptr addrspace(13) @buf
+ %p = inttoptr i32 %off to ptr addrspace(13)
+ store i32 %v, ptr addrspace(13) %p
+ %l = load i32, ptr addrspace(13) @buf
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
new file mode 100644
index 0000000000000..78f22f55ea68e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-const-oob.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A compile-time index past the end of the "VGPR as memory" file is out of range
+; (it would otherwise select physical registers outside the reserved file), so
+; it is diagnosed rather than miscompiled.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: constant index out of range
+define amdgpu_kernel void @const_oob() {
+ %p = getelementptr i32, ptr addrspace(13) @buf, i32 1000
+ %v = load i32, ptr addrspace(13) %p
+ store i32 %v, ptr addrspace(13) @buf
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
new file mode 100644
index 0000000000000..514ddce90ae9d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-dynamic-toolarge.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A dynamic index addresses the whole "VGPR as memory" file as one indexed
+; tuple. A file whose (even-dword-rounded) size has no VGPR tuple class - e.g.
+; 14 dwords - is diagnosed rather than aborting the compiler.
+
+@buf = internal addrspace(13) global [14 x i32] poison
+
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: VGPR-memory file too large for a dynamic index
+define amdgpu_kernel void @dynamic_toolarge(i32 %i) {
+ %p = getelementptr i32, ptr addrspace(13) @buf, i32 %i
+ %v = load i32, ptr addrspace(13) %p
+ store i32 %v, ptr addrspace(13) @buf
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
new file mode 100644
index 0000000000000..2329336b60553
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-extern-call.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; The "VGPR as memory" file lives in low, caller-saved VGPRs that only call-graph
+; members reserve. A call to an external (or indirect) callee would clobber it,
+; so AMDGPULowerModuleVGPRs diagnoses it at the IR level, and the post-RA
+; AMDGPUPrivateObjectVGPRs pass independently diagnoses the (attribute-less)
+; machine call - this also covers calls introduced after the module pass.
+
+@g = internal addrspace(13) global i32 poison
+
+declare void @ext()
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+; CHECK: error: {{.*}}call to a function that clobbers the 'VGPR as memory' reserved file
+define amdgpu_kernel void @extern_call() {
+ store i32 1, ptr addrspace(13) @g
+ call void @ext()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
new file mode 100644
index 0000000000000..52b38ab0e9255
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-indirect-call.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; An indirect call cannot be proven to stay within the call graph that reserves
+; the "VGPR as memory" file, so it could clobber the file. AMDGPULowerModuleVGPRs
+; diagnoses it (the callee is unknown at the IR level).
+
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+; CHECK: error: {{.*}}call to a function that clobbers the 'VGPR as memory' reserved file
+define amdgpu_kernel void @indirect_call(ptr %fp) {
+ store i32 1, ptr addrspace(13) @g
+ call void %fp()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
new file mode 100644
index 0000000000000..8dd5890b6170d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-inlineasm-clobber.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; The "VGPR as memory" file is a block of reserved physical VGPRs. Inline asm
+; that explicitly clobbers one of those registers would corrupt the file, so
+; AMDGPUPrivateObjectVGPRs diagnoses it after register allocation, where the
+; reserved registers are final. (For this function the file is at v2.)
+
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: error: {{.*}}inline asm clobbers a 'VGPR as memory' reserved register
+define void @asm_clobber(i32 %v) {
+ store i32 %v, ptr addrspace(13) @g
+ call void asm sideeffect "", "~{v2}"()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
new file mode 100644
index 0000000000000..06d30cebb03c6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-ungrouped-call.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; A file-using device function that is not reached from any kernel forms a group
+; on its own; a call to a defined function outside that group would clobber the
+; file's reserved registers, so it is diagnosed (not just external/indirect
+; calls).
+
+@g = internal addrspace(13) global i32 poison
+
+define void @other() {
+ ret void
+}
+
+; CHECK: error: {{.*}}'VGPR as memory' is not supported in a function that makes an indirect call or a call outside its call graph
+define void @dev_user() {
+ store i32 1, ptr addrspace(13) @g
+ call void @other()
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
new file mode 100644
index 0000000000000..4fe434f6a2d18
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported-more.ll
@@ -0,0 +1,32 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; Additional "VGPR as memory" accesses the backend cannot lower, each diagnosed
+; rather than reaching instruction selection.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+
+; A dynamic sub-dword access must be naturally aligned so the read-modify-write
+; stays within one dword.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: underaligned sub-dword dynamic access
+define void @underaligned_dyn_subdword(i32 %i, i16 %v) {
+ %p = getelementptr i16, ptr addrspace(13) @buf, i32 %i
+ store i16 %v, ptr addrspace(13) %p, align 1
+ ret void
+}
+
+; A dynamic whole-dword access must be dword aligned.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: misaligned 32-bit dynamic access
+define void @misaligned_dyn_dword(i32 %i, i32 %v) {
+ %p = getelementptr i8, ptr addrspace(13) @buf, i32 %i
+ %p2 = getelementptr i8, ptr addrspace(13) %p, i32 2
+ store i32 %v, ptr addrspace(13) %p2, align 4
+ ret void
+}
+
+; A constant sub-dword field must not straddle a dword boundary.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: sub-dword field crosses a dword boundary
+define i16 @const_subdword_crosses_dword() {
+ %p = getelementptr i8, ptr addrspace(13) @buf, i32 3
+ %v = load i16, ptr addrspace(13) %p, align 1
+ ret i16 %v
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
new file mode 100644
index 0000000000000..7b1b011252846
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-error-unsupported.ll
@@ -0,0 +1,15 @@
+; RUN: not llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck %s
+
+; "VGPR as memory" accesses that the backend cannot lower are diagnosed instead
+; of reaching instruction selection as unselectable memory operations.
+
+@buf = internal addrspace(13) global [16 x i64] poison
+
+; A dynamic index into a wider-than-dword element is unsupported.
+; CHECK: error: {{.*}}unsupported 'VGPR as memory' access: dynamic index wider than 32 bits
+define amdgpu_kernel void @wide_dynamic(i32 %i) {
+ %p = getelementptr i64, ptr addrspace(13) @buf, i32 %i
+ %v = load i64, ptr addrspace(13) %p
+ store i64 %v, ptr addrspace(13) @buf
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
new file mode 100644
index 0000000000000..da0ff1a1a30b3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-function-ref.ll
@@ -0,0 +1,18 @@
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; A "VGPR as memory" global referenced only from an ordinary (non-kernel)
+; function - as IPO might leave after outlining code from a kernel - is still
+; laid out, and the referencing function is annotated. The backend handles
+; direct references to the global from any function, independently of the
+; frontend's placement rules.
+
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset
+@g = internal addrspace(13) global i32 poison
+
+; CHECK: define void @user(i32 %v) #[[ATTR:[0-9]+]]
+define void @user(i32 %v) {
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+; CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-vgpr-memory-base"="{{[0-9]+}}"{{.*}}"amdgpu-vgpr-memory-size"="4"
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
new file mode 100644
index 0000000000000..0dc6dbca45480
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+
+; GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses;
+; fallBackToDAGISel makes such functions fall back to SelectionDAG, which lowers
+; them to register copies rather than crashing in reg-bank legalization.
+
+ at g = internal addrspace(13) global i32 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
new file mode 100644
index 0000000000000..6da6f49a9e082
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; AMDGPULowerModuleVGPRs lays out addrspace(13) globals into per-call-graph
+; groups: disjoint kernels (@k1/@a, @k2/@b) get independent layouts and bases,
+; while functions that share a global (@writer/@reader/@g, reached from @k3)
+; share one consistent group, so the address resolves to the same registers.
+
+@a = internal addrspace(13) global [4 x i32] poison
+@b = internal addrspace(13) global [8 x i32] poison
+@g = internal addrspace(13) global i32 poison
+
+;.
+; CHECK: @a = internal addrspace(13) global [4 x i32] poison, !amdgpu.vgpr.memory.offset [[META0:![0-9]+]]
+; CHECK: @b = internal addrspace(13) global [8 x i32] poison, !amdgpu.vgpr.memory.offset [[META0]]
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset [[META0]]
+;.
+define amdgpu_kernel void @k1(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k1(
+; CHECK-NEXT: [[P:%.*]] = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT: store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %p = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @k2(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k2(
+; CHECK-NEXT: [[P:%.*]] = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT: store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %p = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define void @writer(i32 %v) {
+; CHECK-LABEL: @writer(
+; CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(13) @g, align 4
+; CHECK-NEXT: ret void
+;
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @reader() {
+; CHECK-LABEL: @reader(
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) @g, align 4
+; CHECK-NEXT: ret i32 [[L]]
+;
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
+
+define amdgpu_kernel void @k3(ptr addrspace(1) %out, i32 %v) {
+; CHECK-LABEL: @k3(
+; CHECK-NEXT: call void @writer(i32 [[V:%.*]])
+; CHECK-NEXT: [[R:%.*]] = call i32 @reader()
+; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ call void @writer(i32 %v)
+ %r = call i32 @reader()
+ store i32 %r, ptr addrspace(1) %out
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="16" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="32" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="4" }
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
new file mode 100644
index 0000000000000..44193d15016f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; Sub-dword (i8/i16) "VGPR as memory" accesses at a constant index are realized
+; as a read-modify-write of the containing dword (shifts and masks), since
+; registers have no sub-dword addressing.
+
+@b = internal addrspace(13) global [8 x i8] poison
+@h = internal addrspace(13) global [4 x i16] poison
+
+define void @store_i8(i8 %v) {
+; CHECK-LABEL: store_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff00ff, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT: v_lshl_or_b32 v0, v0, 8, v1
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+ store i8 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define i8 @load_i8() {
+; CHECK-LABEL: load_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_bfe_u32 v0, v0, 8, 8
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+ %l = load i8, ptr addrspace(13) %p
+ ret i8 %l
+}
+
+define void @store_i16(i16 %v) {
+; CHECK-LABEL: store_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+ store i16 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define signext i16 @load_i16_sext() {
+; CHECK-LABEL: load_i16_sext:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+ %l = load i16, ptr addrspace(13) %p
+ ret i16 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
new file mode 100644
index 0000000000000..3c7347c0baaae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" (addrspace(13)) accesses at a constant index lower to plain
+; register copies to/from the reserved VGPR file - never to scratch or buffer
+; memory - and writer/reader of the same global resolve to the same register.
+
+@g = internal addrspace(13) global i32 poison
+@arr = internal addrspace(13) global [4 x i32] poison
+@g64 = internal addrspace(13) global i64 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
+
+define void @store_arr(i32 %v) {
+; CHECK-LABEL: store_arr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define i32 @load_arr() {
+; CHECK-LABEL: load_arr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+ %l = load i32, ptr addrspace(13) %p
+ ret i32 %l
+}
+
+define void @store_i64(i64 %v) {
+; CHECK-LABEL: store_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: v_mov_b32_e32 v3, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i64 %v, ptr addrspace(13) @g64
+ ret void
+}
+
+define i64 @load_i64() {
+; CHECK-LABEL: load_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_mov_b32_e32 v1, v3
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i64, ptr addrspace(13) @g64
+ ret i64 %l
+}
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index bd760de79c9d0..3ca15083959ad 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,24 +2,26 @@
target triple = "amdgcn-amd-amdhsa"
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: %alloca.13 = alloca i32, align 4, addrspace(13)
define void @static_alloca() {
entry:
%alloca.0 = alloca i32, align 4
@@ -36,23 +38,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
define void @dynamic_alloca_i32(i32 %n) {
entry:
@@ -69,23 +71,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
define void @dynamic_alloca_i64(i64 %n) {
entry:
diff --git a/llvm/test/Verifier/AMDGPU/vgpr-memory.ll b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
new file mode 100644
index 0000000000000..406f77ca17599
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
@@ -0,0 +1,49 @@
+; RUN: not llvm-as %s --disable-output 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; A "VGPR as memory" global is register-backed: it has no defined initial
+; contents and its storage is per-lane, so it cannot be statically initialized
+; or atomically accessed. An addrspacecast to/from addrspace(13) is allowed but
+; lowers to poison (it has no meaningful numeric address), and likewise
+; ptrtoint/inttoptr are allowed, so neither is diagnosed here.
+
+; CHECK: atomic operations on the VGPR address space (13) are not allowed
+; CHECK-NEXT: atomicrmw add ptr addrspace(13) @valid.poison
+; CHECK: atomic operations on the VGPR address space (13) are not allowed
+; CHECK-NEXT: %v = load atomic i32, ptr addrspace(13) @valid.poison
+; CHECK: intrinsic with a VGPR address space (13) pointer argument is not allowed
+; CHECK-NEXT: call void @llvm.memcpy
+; CHECK: intrinsic with a VGPR address space (13) pointer argument is not allowed
+; CHECK-NEXT: call void @llvm.memset
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.init
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.zeroinit
+
+; A poison initializer (or none) is fine.
+@valid.poison = internal addrspace(13) global i32 poison
+@valid.array = internal addrspace(13) global [4 x i32] poison
+
+@bad.init = internal addrspace(13) global i32 7
+@bad.zeroinit = internal addrspace(13) global [2 x i32] zeroinitializer
+
+define void @atomic_rmw() {
+ atomicrmw add ptr addrspace(13) @valid.poison, i32 1 seq_cst
+ ret void
+}
+
+define i32 @atomic_load() {
+ %v = load atomic i32, ptr addrspace(13) @valid.poison seq_cst, align 4
+ ret i32 %v
+}
+
+define void @memcpy_vgpr(ptr %src) {
+ call void @llvm.memcpy.p13.p0.i64(ptr addrspace(13) @valid.poison, ptr %src, i64 16, i1 false)
+ ret void
+}
+
+define void @memset_vgpr() {
+ call void @llvm.memset.p13.i64(ptr addrspace(13) @valid.poison, i8 0, i64 16, i1 false)
+ ret void
+}
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index a082adbf6565e..0ec3c753c10f1 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -43,14 +43,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
// and that ANDGCN adds p7 and p8 as well.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
"m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
"m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// Check that the old AMDGCN p8:128:128 definition is upgraded
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
"m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// but that r600 does not.
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
"m:e-e-p:32:32-G1");
@@ -66,7 +66,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
"64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
"1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
- "128:48-p9:192:256:256:32");
+ "128:48-p9:192:256:256:32-p13:32:32");
// Check that SystemZ adds -S64 if needed.
EXPECT_EQ(UpgradeDataLayoutString(
@@ -158,24 +158,24 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
"m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
"m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
"m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// Check that AMDGCN targets don't add already declared address space 7.
- EXPECT_EQ(
- UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
- "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
- EXPECT_EQ(
- UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
- "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
- EXPECT_EQ(
- UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
- "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
+ EXPECT_EQ(UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
+ "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
// Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
// flag.
@@ -216,9 +216,9 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
// Check that AMDGPU targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
- EXPECT_EQ(
- UpgradeDataLayoutString("", "amdgcn"),
- "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+ EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"),
+ "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
// Check that SPIR & SPIRV targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");
More information about the cfe-commits
mailing list