[clang] [llvm] [AMDGPU] Add initial support for VGPR as memory (PR #205435)
Gheorghe-Teodor Bercea via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 26 09:01:14 PDT 2026
https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/205435
>From 587f5182a78051d248866b3d8995a5c43ab54878 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 19 Jun 2026 14:05:08 -0500
Subject: [PATCH 1/3] Add initial support for VGPR as memory
---
clang/include/clang/Basic/Attr.td | 8 +
clang/include/clang/Basic/AttrDocs.td | 20 ++
.../clang/Basic/DiagnosticCommonKinds.td | 5 +
.../clang/Basic/DiagnosticSemaKinds.td | 3 +
clang/include/clang/Sema/SemaAMDGPU.h | 1 +
clang/lib/CodeGen/CGDecl.cpp | 41 ++-
clang/lib/Sema/SemaAMDGPU.cpp | 14 ++
clang/lib/Sema/SemaDeclAttr.cpp | 3 +
.../CodeGenHIP/amdgpu-vgpr-O0-warning.hip | 14 ++
clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip | 19 ++
...a-attribute-supported-attributes-list.test | 1 +
clang/test/SemaCUDA/amdgpu-vgpr.cu | 28 +++
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 4 +
llvm/lib/IR/VerifierAMDGPU.cpp | 6 +-
llvm/lib/Target/AMDGPU/AMDGPU.h | 16 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 168 +++++++++++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 +
.../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp | 145 +++++++++++
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 233 ++++++++++++++++--
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 28 ++-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 19 ++
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 +
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11 +
.../AMDGPU/amdgpu-vgpr-allocate-basic.ll | 109 ++++++++
.../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll | 20 ++
.../CodeGen/AMDGPU/as-vgpr-alloca-static.ll | 58 +++++
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 1 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 9 +-
llvm/test/Verifier/AMDGPU/alloca.ll | 55 +++--
31 files changed, 985 insertions(+), 69 deletions(-)
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
create mode 100644 clang/test/SemaCUDA/amdgpu-vgpr.cu
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index f1ae66bd7f2bb..51a91a432957d 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2521,6 +2521,14 @@ def AMDGPUMaxNumWorkGroups : InheritableAttr {
let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
}
+def AMDGPUVGPR : InheritableAttr {
+ let Spellings = [Clang<"amdgpu_vgpr">];
+ let Documentation = [AMDGPUVGPRDocs];
+ let Subjects = SubjectList<[LocalVar], ErrorDiag>;
+ // Only meaningful in CUDA/HIP; semantic checks restrict it to kernel locals.
+ let LangOpts = [CUDA];
+}
+
def BPFPreserveAccessIndex : InheritableAttr,
TargetSpecificAttr<TargetBPF> {
let Spellings = [Clang<"preserve_access_index">];
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 7c1c88241aaa8..b80265a1aec1d 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3604,6 +3604,26 @@ An error will be given if:
}];
}
+def AMDGPUVGPRDocs : Documentation {
+ let Category = DocCatAMDGPUAttributes;
+ let Content = [{
+This attribute requests that a kernel-local variable be allocated in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
+so that accesses with statically known indices lower to vector register
+copies instead of scratch memory traffic.
+
+Clang supports the ``__attribute__((amdgpu_vgpr))`` or
+``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
+local variables declared in a ``__global__`` (kernel) function; applying it to
+a variable in a ``__device__`` or host function is an error. Outside HIP/CUDA
+the attribute is ignored with a warning.
+
+Known limitation: the request is only honored with optimizations enabled. At
+``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
+emitted.
+ }];
+}
+
def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> {
let Content = [{
Clang supports several different calling conventions, depending on the target
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..fe03be43c80c7 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,6 +319,11 @@ def warn_stack_protection_ignore_attribute : Warning<
"'stack_protector_ignore' attribute ignored due to "
"'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
+def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
+ "%0 is not guaranteed to keep the variable in vector registers at -O0; "
+ "it may fall back to scratch memory">,
+ InGroup<DiagGroup<"amdgpu-vgpr">>;
+
def warn_slh_does_not_support_asm_goto : Warning<
"speculative load hardening does not protect functions with asm goto">,
InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index cde99dfb16ec5..9d52492b8ce64 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3710,6 +3710,9 @@ def err_attribute_argument_invalid : Error<
def err_attribute_amdgpu_flat_work_group_size_mismatch : Error<
"'amdgpu_flat_work_group_size' attribute must match "
"'reqd_work_group_size' product">;
+def err_amdgpu_vgpr_not_kernel_local : Error<
+ "%0 attribute can only be applied to local variables in "
+ "'__global__' (kernel) functions">;
def err_attribute_argument_is_zero : Error<
"%0 attribute must be greater than 0">;
def warn_attribute_argument_n_negative : Warning<
diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h
index a6205534e0de3..9cb74ed74f4b9 100644
--- a/clang/include/clang/Sema/SemaAMDGPU.h
+++ b/clang/include/clang/Sema/SemaAMDGPU.h
@@ -79,6 +79,7 @@ class SemaAMDGPU : public SemaBase {
void handleAMDGPUNumVGPRAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUMaxNumWorkGroupsAttr(Decl *D, const ParsedAttr &AL);
void handleAMDGPUFlatWorkGroupSizeAttr(Decl *D, const ParsedAttr &AL);
+ void handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL);
/// Expand a valid use of the feature identification builtins into its
/// corresponding sequence of instructions.
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 7608f8cb6fc7a..bca2d11d47c6a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include <optional>
using namespace clang;
@@ -1601,9 +1602,37 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// Create the alloca. Note that we set the name separately from
// building the instruction so that it's there even in no-asserts
// builds.
- address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
- allocaAlignment, D.getName(),
- /*ArraySize=*/nullptr, &AllocaAddr);
+ //
+ // "VGPR as memory" objects keep their backing registers only once the
+ // optimizing register allocator runs. At -O0 the backend cannot lower
+ // these accesses (e.g. when the address escapes a basic block), so the
+ // request is not honored: fall back to an ordinary (scratch) alloca and
+ // warn, matching the documented behavior.
+ // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
+ // backing tuple to scratch) so this fallback can be removed.
+ const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
+ const bool UseVGPRMemory =
+ VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
+ if (VGPRAttr && !UseVGPRMemory)
+ CGM.getDiags().Report(D.getLocation(),
+ diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
+ << VGPRAttr;
+
+ if (UseVGPRMemory) {
+ // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
+ // address space so that statically indexed accesses lower to vector
+ // register copies instead of scratch memory.
+ auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
+ /*ArraySize=*/nullptr, D.getName(),
+ AllocaInsertPt->getIterator());
+ AI->setAlignment(allocaAlignment.getAsAlign());
+ AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+ address = AllocaAddr;
+ } else {
+ address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
+ allocaAlignment, D.getName(),
+ /*ArraySize=*/nullptr, &AllocaAddr);
+ }
// Don't emit lifetime markers for MSVC catch parameters. The lifetime of
// the catch parameter starts in the catchpad instruction, and we can't
@@ -1612,8 +1641,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
// Emit a lifetime intrinsic if meaningful. There's no point in doing this
- // if we don't have a valid insertion point (?).
- if (HaveInsertPoint() && !IsMSCatchParam) {
+ // if we don't have a valid insertion point (?). "VGPR as memory" allocas
+ // live in a non-alloca address space, so the standard lifetime markers
+ // (which assume the alloca address space) are skipped for them.
+ if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
// If there's a jump into the lifetime of this variable, its lifetime
// gets broken up into several regions in IR, which requires more work
// to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index 29442617b6a13..b741b9e7f1e24 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Sema/SemaAMDGPU.h"
+#include "clang/AST/Attr.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DynamicRecursiveASTVisitor.h"
#include "clang/AST/Expr.h"
@@ -626,6 +627,19 @@ void SemaAMDGPU::handleAMDGPUFlatWorkGroupSizeAttr(Decl *D,
addAMDGPUFlatWorkGroupSizeAttr(D, AL, MinExpr, MaxExpr);
}
+void SemaAMDGPU::handleAMDGPUVGPRAttr(Decl *D, const ParsedAttr &AL) {
+ // The LocalVar subject list already guarantees this is a local variable.
+ // Require its DeclContext to be a function carrying CUDAGlobalAttr (a
+ // __global__ kernel); otherwise diagnose and do not attach the attribute.
+ const auto *FD = dyn_cast<FunctionDecl>(D->getDeclContext());
+ if (!FD || !FD->hasAttr<CUDAGlobalAttr>()) {
+ Diag(AL.getLoc(), diag::err_amdgpu_vgpr_not_kernel_local) << AL;
+ return;
+ }
+
+ D->addAttr(::new (getASTContext()) AMDGPUVGPRAttr(getASTContext(), AL));
+}
+
static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
Expr *MaxExpr,
const AMDGPUWavesPerEUAttr &Attr) {
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 2159c586e5738..095a11acdd02d 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7641,6 +7641,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_AMDGPUNumVGPR:
S.AMDGPU().handleAMDGPUNumVGPRAttr(D, AL);
break;
+ case ParsedAttr::AT_AMDGPUVGPR:
+ S.AMDGPU().handleAMDGPUVGPRAttr(D, AL);
+ break;
case ParsedAttr::AT_AMDGPUMaxNumWorkGroups:
S.AMDGPU().handleAMDGPUMaxNumWorkGroupsAttr(D, AL);
break;
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
new file mode 100644
index 0000000000000..4d23008b8ef43
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// At -O0 "VGPR as memory" is not honored: the variable falls back to an
+// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
+
+#define __global__ __attribute__((global))
+
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
new file mode 100644
index 0000000000000..9a5c38e48951c
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -emit-llvm -O1 -disable-llvm-passes -o - %s \
+// RUN: | FileCheck %s
+
+#define __global__ __attribute__((global))
+
+// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
+// memory" address space (addrspace(13)), and its accesses stay in that space.
+
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
+// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
+// CHECK: store i32 %{{.*}}, ptr addrspace(13)
+// CHECK: load i32, ptr addrspace(13)
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr));
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
index 03b9a77ec1814..69cc257aa3120 100644
--- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test
+++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test
@@ -7,6 +7,7 @@
// CHECK-NEXT: AMDGPUMaxNumWorkGroups (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumSGPR (SubjectMatchRule_function)
// CHECK-NEXT: AMDGPUNumVGPR (SubjectMatchRule_function)
+// CHECK-NEXT: AMDGPUVGPR (SubjectMatchRule_variable_is_local)
// CHECK-NEXT: AMDGPUWavesPerEU (SubjectMatchRule_function)
// CHECK-NEXT: AVRSignal (SubjectMatchRule_function)
// CHECK-NEXT: AbiTag (SubjectMatchRule_record_not_is_union, SubjectMatchRule_variable, SubjectMatchRule_function, SubjectMatchRule_namespace)
diff --git a/clang/test/SemaCUDA/amdgpu-vgpr.cu b/clang/test/SemaCUDA/amdgpu-vgpr.cu
new file mode 100644
index 0000000000000..6ad3074921b9b
--- /dev/null
+++ b/clang/test/SemaCUDA/amdgpu-vgpr.cu
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+__global__ void kernel() {
+ int ok[4] __attribute__((amdgpu_vgpr)); // OK
+ (void)ok;
+}
+
+__device__ void device_fn() {
+ int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+ (void)bad;
+}
+
+__host__ void host_fn() {
+ int bad __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute can only be applied to local variables in '__global__' (kernel) functions}}
+ (void)bad;
+}
+
+// Not a local variable.
+int global_var __attribute__((amdgpu_vgpr)); // expected-error {{'amdgpu_vgpr' attribute only applies to local variables}}
+
+__global__ void takes_no_args() {
+ // Attribute does not accept arguments.
+ int bad __attribute__((amdgpu_vgpr(1))); // expected-error {{'amdgpu_vgpr' attribute takes no arguments}}
+ (void)bad;
+}
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index 01b1510524d0f..e9d3add54d054 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -47,6 +47,10 @@ enum : unsigned {
BUFFER_STRIDED_POINTER = 9, ///< Address space for 192-bit fat buffer
///< pointers with an additional index.
+ VGPR = 13, ///< Address space for "VGPR as memory": objects backed by VGPRs
+ ///< rather than scratch. Shares its numeric value with the
+ ///< graphics-only CONSTANT_BUFFER_5 alias below.
+
RESERVED_ADDRESS_SPACE_16 = 16, ///< Reserved for downstream use.
/// Internal address spaces. Can be freely renumbered.
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index 04cb214ef2520..de9a0c7bef132 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -122,8 +122,10 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
if (!VS.TT.isAMDGPU())
return;
- if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
- VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+ if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
+ AI.getAddressSpace() != AMDGPUAS::VGPR)
+ VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
+ &AI);
}
bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index c6dd1dbb62449..3336ea6d1f943 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca();
+FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
@@ -276,6 +276,20 @@ struct AMDGPUPromoteAllocaPass
TargetMachine &TM;
};
+void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
+extern char &AMDGPUPrivateObjectVGPRsID;
+
+// New pass manager pass ("amdgpu-vgpr-allocate") that allocates pre-existing
+// VGPR address space allocas without performing any optimization-oriented
+// alloca promotion. Used at -O0 so "VGPR as memory" objects remain functional.
+struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
+ AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+};
+
struct AMDGPUPromoteAllocaToVectorPass
: OptionalPassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 7330f3b13f3cb..8e289058a2ed1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,8 +21,10 @@
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -341,25 +343,159 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
-void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- if (!Subtarget->d16PreservesUnusedBits())
- return;
+// Resolve the constant byte offset within the per-function VGPR file for a
+// "VGPR as memory" access whose (legalized) address is \p Ptr: the frame index
+// of an AMDGPUAS::VGPR alloca, optionally under one ADD/PTRADD with a constant
+// second operand. Returns std::nullopt for any other address shape.
+static std::optional<unsigned>
+getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
+ unsigned ExtraOffset = 0;
+ if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
+ ExtraOffset = C->getZExtValue(); // NOTE(review): confirm fits unsigned.
+ Ptr = Ptr.getOperand(0);
+ }
+ }
+ auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FI)
+ return std::nullopt;
+ const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+ if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+ return std::nullopt;
+ return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+}
+
+// Lower a load/store of a "VGPR as memory" object into one
+// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
+// offset; AMDGPUPrivateObjectVGPRs later expands them into subregister copies.
+// Accesses wider than a dword are split into dword lanes; sub-dword and
+// non-dword-multiple accesses are left alone (AMDGPUPromoteAlloca demotes such
+// objects to scratch), as are non-simple, extending or truncating accesses and
+// unresolved or unaligned offsets. Returns true if \p N was rewritten.
+bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
+ if (auto *Load = dyn_cast<LoadSDNode>(N)) {
+ if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
+ Load->getExtensionType() != ISD::NON_EXTLOAD)
+ return false;
+ EVT VT = Load->getValueType(0);
+ unsigned Bits = VT.getFixedSizeInBits();
+ if (Bits == 0 || Bits % 32 != 0)
+ return false;
+ std::optional<unsigned> Offset =
+ getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
+ if (!Offset || (*Offset % 4 != 0))
+ return false;
+
+ SDLoc DL(N);
+ unsigned NumDwords = Bits / 32;
+ SmallVector<SDValue, 4> Dwords;
+ SmallVector<SDValue, 4> Chains;
+ for (unsigned I = 0; I != NumDwords; ++I) {
+ SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+ Load->getChain()};
+ MachineSDNode *Lane = CurDAG->getMachineNode(
+ AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
+ if (I == 0)
+ CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
+ Dwords.push_back(SDValue(Lane, 0));
+ Chains.push_back(SDValue(Lane, 1));
+ }
+
+ SDValue Val;
+ if (NumDwords == 1) {
+ Val = Dwords[0];
+ if (VT != MVT::i32)
+ Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
+ } else {
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+ SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
+ Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
+ }
+ SDValue Chain = NumDwords == 1 ? Chains[0]
+ : CurDAG->getNode(ISD::TokenFactor, DL,
+ MVT::Other, Chains);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
+ return true;
+ }
+
+ if (auto *Store = dyn_cast<StoreSDNode>(N)) {
+ if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
+ Store->isTruncatingStore())
+ return false;
+ SDValue Val = Store->getValue();
+ EVT VT = Val.getValueType();
+ unsigned Bits = VT.getFixedSizeInBits();
+ if (Bits == 0 || Bits % 32 != 0)
+ return false;
+ std::optional<unsigned> Offset =
+ getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
+ if (!Offset || (*Offset % 4 != 0))
+ return false;
+
+ SDLoc DL(N);
+ unsigned NumDwords = Bits / 32;
+ SmallVector<SDValue, 4> Dwords;
+ if (NumDwords == 1) {
+ if (VT != MVT::i32)
+ Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
+ Dwords.push_back(Val);
+ } else {
+ EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
+ SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
+ for (unsigned I = 0; I != NumDwords; ++I)
+ Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ Vec,
+ CurDAG->getConstant(I, DL, MVT::i32)));
+ }
+
+ SmallVector<SDValue, 4> Chains;
+ for (unsigned I = 0; I != NumDwords; ++I) {
+ SDValue Ops[] = {Dwords[I],
+ CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
+ Store->getChain()};
+ MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
+ DL, MVT::Other, Ops);
+ if (I == 0)
+ CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
+ Chains.push_back(SDValue(Lane, 0));
+ }
+ SDValue Chain = NumDwords == 1 ? Chains[0]
+ : CurDAG->getNode(ISD::TokenFactor, DL,
+ MVT::Other, Chains);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
+ return true;
+ }
- SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+ return false;
+}
+void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
- while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = &*--Position;
- if (N->use_empty())
- continue;
-
- switch (N->getOpcode()) {
- case ISD::BUILD_VECTOR:
- // TODO: Match load d16 from shl (extload:i16), 16
- MadeChange |= matchLoadD16FromBuildVector(N);
- break;
- default:
- break;
+
+ // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
+ // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
+ SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
+ while (VGPRPos != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--VGPRPos;
+ MadeChange |= rewriteVGPRFrameAccess(N);
+ }
+
+ if (Subtarget->d16PreservesUnusedBits()) {
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // TODO: Match load d16 from shl (extload:i16), 16
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 95f85a6151375..cf62874912742 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,6 +67,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool runOnMachineFunction(MachineFunction &MF) override;
bool matchLoadD16FromBuildVector(SDNode *N) const;
+ bool rewriteVGPRFrameAccess(SDNode *N);
void PreprocessISelDAG() override;
void Select(SDNode *N) override;
void PostprocessISelDAG() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 2a6560b309e62..b377704c2f296 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,7 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
+FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
FUNCTION_PASS("amdgpu-promote-kernel-arguments",
AMDGPUPromoteKernelArgumentsPass())
FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
new file mode 100644
index 0000000000000..a3a1cf6f18bed
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -0,0 +1,145 @@
+//===-- AMDGPUPrivateObjectVGPRs.cpp - Lower VGPR-as-memory accesses ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
+/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
+/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
+/// carries a constant byte offset, which selects the dword (subregister) to
+/// copy.
+///
+/// This runs once the function is out of SSA form (so the single backing tuple
+/// can be defined by several subregister copies) and while LiveIntervals is
+/// available. The backing tuple has lane-divergent liveness (its subregisters
+/// are written and read independently), which the whole-register LiveVariables
+/// analysis cannot represent; the pass therefore updates the subregister-aware
+/// LiveIntervals directly.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-private-object-vgprs"
+
+namespace {
+
+class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPrivateObjectVGPRs() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Private Object VGPRs";
+ }
+ // Requires LiveIntervals and preserves it, SlotIndexes and the CFG.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervalsWrapperPass>();
+ AU.addPreserved<LiveIntervalsWrapperPass>();
+ AU.addPreserved<SlotIndexesWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(AMDGPUPrivateObjectVGPRs, DEBUG_TYPE,
+ "AMDGPU Private Object VGPRs", false, false)
+
+char AMDGPUPrivateObjectVGPRs::ID = 0;
+
+char &llvm::AMDGPUPrivateObjectVGPRsID = AMDGPUPrivateObjectVGPRs::ID;
+
+bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Collect the pseudos; the largest byte offset (operand 1) sizes the tuple.
+ SmallVector<MachineInstr *, 8> Worklist;
+ unsigned NumDwords = 0;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ unsigned Opc = MI.getOpcode();
+ if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
+ Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+ continue;
+ unsigned ByteOffset = MI.getOperand(1).getImm();
+ NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+ Worklist.push_back(&MI);
+ }
+ }
+
+ if (Worklist.empty())
+ return false;
+
+ LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ // NOTE(review): confirm a VGPR class exists for every NumDwords * 32 width.
+ const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
+ assert(RC && "no VGPR register class for VGPR-as-memory object");
+ Register Storage = MRI.createVirtualRegister(RC);
+
+ // Define the whole tuple up front so partial (subregister) writes and reads
+ // of uninitialized lanes are well formed.
+ MachineBasicBlock &Entry = MF.front();
+ MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
+ LIS->InsertMachineInstrInMaps(*ImpDef);
+
+ for (MachineInstr *MI : Worklist) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ const DebugLoc &DL = MI->getDebugLoc();
+ unsigned Dword = MI->getOperand(1).getImm() / 4;
+ unsigned SubReg = NumDwords == 1
+ ? AMDGPU::NoSubRegister
+ : SIRegisterInfo::getSubRegFromChannel(Dword);
+
+ MachineInstr *Copy;
+ if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+ Register Dst = MI->getOperand(0).getReg();
+ Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
+ .addReg(Storage, {}, SubReg);
+ } else {
+ Register Src = MI->getOperand(0).getReg();
+ Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
+ .addReg(Storage, RegState::Define, SubReg)
+ .addReg(Src);
+ }
+ // The copy takes the pseudo's slot, so the intervals of the copied
+ // load/store operand stay valid.
+ LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
+ MI->eraseFromParent();
+ }
+
+ // The backing tuple is brand new; compute its (subregister) live interval.
+ LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
+
+ // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
+ // form disconnected value-number components within the single tuple, which an
+ // individual live interval must not contain. Split them into separate
+ // virtual registers, exactly as the register coalescer does for the intervals
+ // it leaves behind.
+ SmallVector<LiveInterval *, 4> SplitLIs;
+ LIS->splitSeparateComponents(LI, SplitLIs);
+
+ return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 95e06dc8295d9..32ab847c8d8f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -138,6 +139,7 @@ class AMDGPUPromoteAllocaImpl {
unsigned MaxVGPRs;
unsigned VGPRBudgetRatio;
unsigned MaxVectorRegs;
+ unsigned AllocVGPROffset = 0;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -162,6 +164,10 @@ class AMDGPUPromoteAllocaImpl {
void analyzePromoteToVector(AllocaAnalysis &AA) const;
void promoteAllocaToVector(AllocaAnalysis &AA);
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
+
+ /// Allocate an alloca that already lives in the VGPR address space to a range
+ /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
+ void allocateVgprs(AllocaAnalysis &AA);
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
SetVector<IntrinsicInst *> &DeferredIntrs);
void
@@ -179,7 +185,11 @@ class AMDGPUPromoteAllocaImpl {
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
}
- bool run(Function &F, bool PromoteToLDS);
+ /// IsLatePass is true when invoked as a codegen pass and false when invoked
+ /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
+ /// requests only the work strictly required for functionality (i.e. VGPR
+ /// allocation), skipping the optimization-oriented promotions.
+ bool run(Function &F, bool IsLatePass, bool NoOpt);
};
// FIXME: This can create globals so should be a module pass.
@@ -187,26 +197,34 @@ class AMDGPUPromoteAlloca : public FunctionPass {
public:
static char ID;
- AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+ explicit AMDGPUPromoteAlloca(
+ CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
+ : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ // skipFunction() is true for optnone functions (and under opt-bisect), but
+ // allocas in the VGPR address space still have to be assigned VGPRs there.
+ // Instead of bailing out, restrict the run to that required work.
+ const bool RequiredOnly = NoOpt || skipFunction(F);
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
- .run(F, /*PromoteToLDS*/ true);
+ .run(F, /*IsLatePass=*/true, RequiredOnly);
+ }
return false;
}
- StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+ StringRef getPassName() const override {
+ return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
+
+private:
+ bool NoOpt;
};
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -253,7 +271,8 @@ char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
+ /*NoOpt=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -265,7 +284,8 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/false,
+ /*NoOpt=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -274,8 +294,21 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
return PreservedAnalyses::all();
}
-FunctionPass *llvm::createAMDGPUPromoteAlloca() {
- return new AMDGPUPromoteAlloca();
+// New pass manager entry point that runs the promote-alloca implementation as
+// the late pass with NoOpt set. CFG analyses are preserved when the function
+// changed; everything is preserved otherwise.
+PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
+                                              FunctionAnalysisManager &AM) {
+  auto &Loops = AM.getResult<LoopAnalysis>(F);
+  AMDGPUPromoteAllocaImpl Impl(TM, Loops);
+  if (!Impl.run(F, /*IsLatePass=*/true, /*NoOpt=*/true))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
+ return new AMDGPUPromoteAlloca(OptLevel);
}
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -368,9 +401,110 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
- if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
+// A "VGPR as memory" object can only be realized in registers today when every
+// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
+// bit) load/store that lies entirely inside the object, and its address never
+// escapes. Sub-dword accesses, dynamic indexing and escaping addresses need
+// gfx13 support, which is not yet available; such objects fall back to scratch
+// instead.
+//
+// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
+// support so this fallback is no longer needed.
+//
+// Returns true only if every transitive use of \p AI satisfies the rules
+// above.
+static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
+                                            const DataLayout &DL) {
+  // Accesses are bounds-checked against the object, so its size has to be a
+  // known compile-time constant.
+  std::optional<TypeSize> ObjectSize = AI.getAllocationSize(DL);
+  if (!ObjectSize || ObjectSize->isScalable())
+    return false;
+  const uint64_t ObjectBytes = ObjectSize->getFixedValue();
+
+  // An access is lowerable if it covers a whole number of dwords, starts at a
+  // dword-aligned constant offset from the alloca, and stays inside it.
+  auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
+    if (!Simple)
+      return false;
+    TypeSize StoreBits = DL.getTypeStoreSizeInBits(Ty);
+    if (StoreBits.isScalable())
+      return false;
+    const uint64_t Bits = StoreBits.getFixedValue();
+    if (Bits == 0 || Bits % 32 != 0)
+      return false;
+    APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+    const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
+        DL, Off, /*AllowNonInbounds=*/true);
+    if (Base != &AI || Off.urem(4) != 0)
+      return false;
+    // A negative offset shows up here as a huge unsigned value, so a single
+    // unsigned comparison rejects accesses on either side of the object.
+    const uint64_t Bytes = Bits / 8;
+    return Bytes <= ObjectBytes && Off.ule(ObjectBytes - Bytes);
+  };
+
+  SmallVector<const Use *, 16> Worklist;
+  for (const Use &U : AI.uses())
+    Worklist.push_back(&U);
+
+  while (!Worklist.empty()) {
+    const Use *U = Worklist.pop_back_val();
+    const User *Usr = U->getUser();
+
+    if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      for (const Use &GU : GEP->uses())
+        Worklist.push_back(&GU);
+      continue;
+    }
+    if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
+      if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
+        return false;
+      continue;
+    }
+    if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
+      // The pointer must be the address operand, not a stored value (escape).
+      if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
+        return false;
+      if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
+                    SI->isSimple()))
+        return false;
+      continue;
+    }
+    // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
+    // otherwise not statically lowerable.
+    return false;
+  }
+  return true;
+}
+
+// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
+// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
+// falls back to ordinary scratch.
+//
+// GEP users are rebuilt on top of \p New and handled recursively, lifetime
+// markers are erased, and every other user just has its operand replaced.
+//
+// NOTE(review): the plain operand replacement leaves the user's own type
+// untouched, so users whose type or signature names addrspace(13) (phi, select,
+// a call taking ptr addrspace(13), pointer-overloaded intrinsics such as
+// memcpy) would presumably end up ill-typed -- confirm such users cannot reach
+// this point, or handle them explicitly.
+static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
+ // Snapshot the use list up front: the loop below rewrites and erases users.
+ SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
+ for (Use *U : Uses) {
+ User *Usr = U->getUser();
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
+ // The GEP's result type carries the address space, so it cannot simply be
+ // repointed: build an equivalent GEP on New, move the old GEP's users over
+ // to it, then drop the old GEP.
+ IRBuilder<> B(GEP);
+ SmallVector<Value *, 4> Indices(GEP->indices());
+ Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
+ GEP->getName(), GEP->getNoWrapFlags());
+ rewriteVGPRPointerToScratch(GEP, NewGEP);
+ GEP->eraseFromParent();
+ continue;
+ }
+ // Lifetime markers are removed rather than repointed.
+ if (auto *II = dyn_cast<IntrinsicInst>(Usr);
+ II && II->isLifetimeStartOrEnd()) {
+ II->eraseFromParent();
+ continue;
+ }
+ // Loads, stores, address-space casts and call arguments only need this
+ // operand repointed; their result types do not depend on the operand's
+ // address space.
+ U->set(New);
+ }
+}
+
+// Replace \p AI with an alloca of the same type, array size, alignment and
+// name in the private address space, move every use over to the replacement,
+// and erase \p AI.
+static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
+  Type *ObjectTy = AI->getAllocatedType();
+  Value *Count = AI->getArraySize();
+  auto *ScratchAI =
+      new AllocaInst(ObjectTy, AMDGPUAS::PRIVATE_ADDRESS, Count, AI->getAlign(),
+                     AI->getName(), AI->getIterator());
+  ScratchAI->setDebugLoc(AI->getDebugLoc());
+
+  rewriteVGPRPointerToScratch(AI, ScratchAI);
+  AI->eraseFromParent();
+}
+
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
+ assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
+ if (!IsLatePass && DisablePromoteAllocaToVector)
+ return false;
+
+ bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
+ bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -379,6 +513,12 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
setFunctionLimits(F);
+ // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
+ // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ const bool TargetSupportsVGPRAsMemory =
+ ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
+
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
@@ -395,8 +535,18 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
AllocaAnalysis AA{AI};
+ if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
+ // Allocas that already live in the VGPR address space only need to be
+ // assigned VGPRs, which is required for functionality.
+ if (IsLatePass)
+ Allocas.push_back(std::move(AA));
+ continue;
+ }
+ if (!PromoteToVector && !PromoteToLDS)
+ continue;
if (collectAllocaUses(AA)) {
- analyzePromoteToVector(AA);
+ if (PromoteToVector)
+ analyzePromoteToVector(AA);
if (PromoteToLDS)
analyzePromoteToLDS(AA);
if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -407,8 +557,15 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
}
}
- stable_sort(Allocas,
- [](const auto &A, const auto &B) { return A.Score > B.Score; });
+ stable_sort(Allocas, [](const auto &A, const auto &B) {
+ // Prioritize pre-existing VGPR allocas, since their allocation must not
+ // fail.
+ bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+ bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
+ if (AIsVGPR != BIsVGPR)
+ return AIsVGPR;
+ return A.Score > B.Score;
+ });
// clang-format off
LLVM_DEBUG(
@@ -421,6 +578,39 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
bool Changed = false;
SetVector<IntrinsicInst *> DeferredIntrs;
for (AllocaAnalysis &AA : Allocas) {
+ if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
+ // Fall back to scratch (and warn) when the object can't be kept in
+ // registers, so the program still compiles correctly: either the target
+ // does not support "VGPR as memory", or the access pattern (dynamic
+ // index, sub-dword, escaping address) is not yet supported.
+ const char *Unsupported = nullptr;
+ if (!TargetSupportsVGPRAsMemory)
+ Unsupported = "not supported on this target";
+ else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
+ Unsupported = "dynamic indexing, sub-dword access, or escaping address "
+ "is not yet supported";
+ if (Unsupported) {
+ F.getContext().diagnose(DiagnosticInfoUnsupported(
+ F,
+ Twine("'amdgpu_vgpr' object could not be kept in vector registers "
+ "(") +
+ Unsupported + "); using scratch memory instead",
+ AA.Alloca->getDebugLoc(), DS_Warning));
+ demoteVGPRAllocaToScratch(AA.Alloca);
+ Changed = true;
+ continue;
+ }
+ const unsigned AllocaCost =
+ AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
+ allocateVgprs(AA);
+ // Account for the consumed VGPRs in the vectorization budget.
+ if (VectorizationBudget > AllocaCost)
+ VectorizationBudget -= AllocaCost;
+ else
+ VectorizationBudget = 0;
+ Changed = true;
+ continue;
+ }
if (AA.Vector.Ty) {
std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
assert(Size); // Expected to succeed on non-array alloca.
@@ -455,6 +645,21 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
return Changed;
}
+// Assign \p AA's alloca the next free byte range of the VGPR file and record
+// the (offset, size) pair, both in bytes, as !amdgpu.allocated.vgprs metadata
+// on the alloca. The running offset advances by the size rounded up to a whole
+// dword.
+void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
+  LLVMContext &Ctx = Mod->getContext();
+  // Use the allocation size rather than the bit size divided by 8: the latter
+  // truncates types that are not a whole number of bytes and ignores padding,
+  // and run() already charges the VGPR budget by the allocation size.
+  std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
+  assert(Size && Size->isFixed() && "expected a fixed-size static alloca");
+  const unsigned AllocaSize = Size->getFixedValue();
+
+  // Record where the object was allocated within the VGPR file.
+  Type *I32 = Type::getInt32Ty(Ctx);
+  AA.Alloca->setMetadata(
+      "amdgpu.allocated.vgprs",
+      MDNode::get(
+          Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
+                ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
+  AllocVGPROffset += alignTo(AllocaSize, 4);
+}
+
// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ae6e6d0bdcd1e..5814862a514b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -668,6 +668,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
+ initializeAMDGPUPrivateObjectVGPRsPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1500,9 +1501,12 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAtomicExpandLegacyPass());
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
- addPass(createAMDGPUPromoteAlloca());
+ // With optimizations enabled, do the full promotion of allocas. Without
+ // optimizations, this only allocates pre-existing VGPR address space allocas,
+ // which is required for functionality.
+ addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
@@ -1717,6 +1721,11 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+ // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+ // At O0 there is no register coalescer; anchor on TwoAddress, where
+ // LiveIntervals is already available.
+ insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
+
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
TargetPassConfig::addFastRegAlloc();
@@ -1743,6 +1752,12 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
+ // Lower "VGPR as memory" accesses to register copies once out of SSA form.
+ // This runs after the coalescer so it does not perturb the kill flags that
+ // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
+ // the LiveIntervals the register allocator consumes next.
+ insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
+
if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
@@ -2283,8 +2298,15 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
addFunctionPass(AtomicExpandPass(TM), PMW);
- if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ // With optimizations enabled, do the full promotion of allocas. Without
+ // optimizations, only allocate pre-existing VGPR address space allocas, which
+ // is required for functionality.
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
+ else
+ addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
+
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses(PMW);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 46edc44e2cc05..dd25ab71997d7 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -100,6 +100,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUPreloadKernArgProlog.cpp
AMDGPUPreloadKernelArguments.cpp
AMDGPUPrintfRuntimeBinding.cpp
+ AMDGPUPrivateObjectVGPRs.cpp
AMDGPUPromoteAlloca.cpp
AMDGPUPromoteKernelArguments.cpp
AMDGPURegBankCombiner.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 750cb1973e21f..3594caef86782 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,6 +1243,25 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
}
} // End Spill = 1, VALU = 1, isConvergent = 1
+// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
+// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
+// within the per-function VGPR file. They are produced during instruction
+// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
+// pass before register allocation.
+let hasSideEffects = 0 in {
+// Reads the dword at byte offset $offset into $vdst. Operand order is
+// (vdst, offset); the rewriting pass reads the offset as operand 1.
+def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
+ (ins i32imm:$offset)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// Writes $vdata to the dword at byte offset $offset. Operand order is
+// (vdata, offset), so the offset is operand 1 here as well.
+def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
+ (ins VGPR_32:$vdata, i32imm:$offset)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+} // End hasSideEffects = 0
+
+
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 96571dd028b14..7528cd2a009a3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
@@ -1779,6 +1780,17 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
return false;
}
+// Decode the two integer operands of the !amdgpu.allocated.vgprs node attached
+// to \p Alloca: operand 0 is the address, operand 1 the size.
+AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
+  const MDNode *Node = Alloca.getMetadata("amdgpu.allocated.vgprs");
+  assert(Node && Node->getNumOperands() == 2 &&
+         "expected !amdgpu.allocated.vgprs metadata with 2 operands");
+
+  auto ReadField = [Node](unsigned Idx) -> unsigned {
+    return mdconst::extract<ConstantInt>(Node->getOperand(Idx))
+        ->getZExtValue();
+  };
+
+  AllocatedVGPRsMetadata Result;
+  Result.Address = ReadField(0);
+  Result.Size = ReadField(1);
+  return Result;
+}
+
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 1623dc72d2810..b34dde7cb2cd7 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -30,6 +30,7 @@ struct amd_kernel_code_t;
namespace llvm {
struct Align;
+class AllocaInst;
class Argument;
class Function;
class GlobalValue;
@@ -1032,6 +1033,16 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
+/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
+/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
+/// to within the VGPR file, and its size in bytes.
+struct AllocatedVGPRsMetadata {
+ /// Byte offset of the object within the VGPR file (metadata operand 0).
+ unsigned Address;
+ /// Size of the object in bytes (metadata operand 1).
+ unsigned Size;
+
+ /// Decode the metadata attached to \p Alloca. The node must be present and
+ /// have exactly two operands; this is only checked by an assertion.
+ static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
+};
+
+
// The following methods are only meaningful on targets that support
// S_WAITCNT.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
new file mode 100644
index 0000000000000..f6c64c5121867
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
+
+define void @vgpr_alloca() {
+; CHECK-LABEL: define void @vgpr_alloca(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ ret void
+}
+
+define void @vgpr_alloca_multiple() {
+; CHECK-LABEL: define void @vgpr_alloca_multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
+; CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
+; CHECK-NEXT: store i32 0, ptr addrspace(13) [[B]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca i32, align 4, addrspace(13)
+ %b = alloca [2 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ store i32 0, ptr addrspace(13) %b
+ ret void
+}
+
+define void @private_alloca_unchanged() {
+; CHECK-LABEL: define void @private_alloca_unchanged(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: store i64 42, ptr addrspace(5) [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i64], align 4, addrspace(5)
+ store i64 42, ptr addrspace(5) %a
+ ret void
+}
+
+declare void @use(ptr)
+
+; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
+; back to ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
+; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
+; CHECK-NEXT: store i32 [[V]], ptr addrspace(5) [[P2]], align 4
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+; A VGPR object whose address escapes (here via a cast to a generic pointer, as
+; the frontend emits) cannot be kept in registers yet, so it falls back to
+; ordinary (addrspace(5)) scratch.
+define void @vgpr_alloca_escaping() {
+; CHECK-LABEL: define void @vgpr_alloca_escaping(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
+; CHECK-NEXT: call void @use(ptr [[CAST]])
+; CHECK-NEXT: ret void
+;
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %cast = addrspacecast ptr addrspace(13) %a to ptr
+ call void @use(ptr %cast)
+ ret void
+}
+
+; Whole-dword-multiple accesses (here i64) stay in VGPRs.
+define void @vgpr_alloca_i64(i64 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_i64(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
+; CHECK-NEXT: store i64 [[V]], ptr addrspace(13) [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a = alloca i64, align 8, addrspace(13)
+ store i64 %v, ptr addrspace(13) %a
+ ret void
+}
+
+; Sub-dword accesses are not supported yet, so the object falls back to scratch.
+define void @vgpr_alloca_subdword(i16 %v) {
+; CHECK-LABEL: define void @vgpr_alloca_subdword(
+; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
+; CHECK-NEXT: store i16 [[V]], ptr addrspace(5) [[A1]], align 2
+; CHECK-NEXT: ret void
+;
+ %a = alloca [2 x i16], align 4, addrspace(13)
+ store i16 %v, ptr addrspace(13) %a
+ ret void
+}
+;.
+; CHECK: [[META0]] = !{i32 0, i32 16}
+; CHECK: [[META1]] = !{i32 0, i32 4}
+; CHECK: [[META2]] = !{i32 4, i32 8}
+; CHECK: [[META3]] = !{i32 0, i32 8}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
new file mode 100644
index 0000000000000..63ba44b479279
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -0,0 +1,20 @@
+; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
+; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
+; scratch.
+
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
+
+define void @vgpr_obj() {
+; SUPP: alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
+; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ store i32 0, ptr addrspace(13) %a
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
new file mode 100644
index 0000000000000..ea914907a900d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
@@ -0,0 +1,58 @@
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
+; indices must lower to register copies, never to scratch/buffer memory traffic.
+
+; CHECK-LABEL: store_load_i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_i32(i32 %v) {
+ %a = alloca i32, align 4, addrspace(13)
+ store i32 %v, ptr addrspace(13) %a
+ %l = load i32, ptr addrspace(13) %a
+ %r = add i32 %l, 1
+ ret i32 %r
+}
+
+; CHECK-LABEL: store_load_array:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i32 @store_load_array(i32 %v) {
+ %a = alloca [4 x i32], align 4, addrspace(13)
+ %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
+ %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
+ store i32 %v, ptr addrspace(13) %p1
+ store i32 7, ptr addrspace(13) %p3
+ %l1 = load i32, ptr addrspace(13) %p1
+ %l3 = load i32, ptr addrspace(13) %p3
+ %s = add i32 %l1, %l3
+ ret i32 %s
+}
+
+; A 64-bit (two-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_i64:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define i64 @store_load_i64(i64 %v) {
+ %a = alloca i64, align 8, addrspace(13)
+ store i64 %v, ptr addrspace(13) %a
+ %l = load i64, ptr addrspace(13) %a
+ %r = add i64 %l, 1
+ ret i64 %r
+}
+
+; A vector (four-dword) access is split into per-dword register copies.
+; CHECK-LABEL: store_load_v4i32:
+; CHECK-NOT: scratch_
+; CHECK-NOT: buffer_
+; CHECK: s_setpc_b64
+define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
+ %a = alloca <4 x i32>, align 16, addrspace(13)
+ store <4 x i32> %v, ptr addrspace(13) %a
+ %l = load <4 x i32>, ptr addrspace(13) %a
+ ret <4 x i32> %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 73f807e9d55c5..94173fb7b11d2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -29,6 +29,7 @@
; GCN-O0-NEXT: amdgpu-lower-module-lds
; GCN-O0-NEXT: function
; GCN-O0-NEXT: atomic-expand
+; GCN-O0-NEXT: amdgpu-vgpr-allocate
; GCN-O0-NEXT: verify
; GCN-O0-NEXT: unreachableblockelim
; GCN-O0-NEXT: ee-instrument<post-inline>
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 070c873798647..aabfadd33e976 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -49,11 +49,13 @@
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand Atomic instructions
+; GCN-O0-NEXT: Dominator Tree Construction
+; GCN-O0-NEXT: Natural Loop Information
+; GCN-O0-NEXT: AMDGPU VGPR Allocate
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
-; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: AMDGPU lower intrinsics
@@ -115,6 +117,7 @@
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
+; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
@@ -359,6 +362,7 @@
; GCN-O1-NEXT: Live Interval Analysis
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Register Coalescer
+; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: Rename Disconnected Subregister Components
; GCN-O1-NEXT: Rewrite Partial Register Uses
; GCN-O1-NEXT: Machine Instruction Scheduler
@@ -676,6 +680,7 @@
; GCN-O1-OPTS-NEXT: Live Interval Analysis
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Register Coalescer
+; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: Rewrite Partial Register Uses
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
@@ -998,6 +1003,7 @@
; GCN-O2-NEXT: Live Interval Analysis
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Register Coalescer
+; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: Rewrite Partial Register Uses
; GCN-O2-NEXT: Machine Instruction Scheduler
@@ -1334,6 +1340,7 @@
; GCN-O3-NEXT: Live Interval Analysis
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Register Coalescer
+; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: Rewrite Partial Register Uses
; GCN-O3-NEXT: Machine Instruction Scheduler
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index f31d6228d7936..bd760de79c9d0 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,23 +2,23 @@
target triple = "amdgcn-amd-amdhsa"
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
define void @static_alloca() {
entry:
@@ -32,26 +32,27 @@ entry:
%alloca.7 = alloca i32, align 4, addrspace(7)
%alloca.8 = alloca i32, align 4, addrspace(8)
%alloca.9 = alloca i32, align 4, addrspace(9)
+ %alloca.13 = alloca i32, align 4, addrspace(13)
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
define void @dynamic_alloca_i32(i32 %n) {
entry:
@@ -68,23 +69,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5)
+; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
define void @dynamic_alloca_i64(i64 %n) {
entry:
>From 8e5a8ec027c51b92143f95404e3ad6f068300742 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Wed, 24 Jun 2026 19:32:40 -0500
Subject: [PATCH 2/3] Lower VGPR-as-memory accesses via REG_LOAD/REG_STORE
nodes
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 405 ++++++------------
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 +-
.../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp | 70 ++-
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 10 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 121 +++++-
llvm/lib/Target/AMDGPU/SIISelLowering.h | 113 +++--
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 13 +
llvm/lib/Target/AMDGPU/SIInstructions.td | 51 ++-
.../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll | 3 +-
9 files changed, 405 insertions(+), 384 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 8e289058a2ed1..66c9353cd5c33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -68,7 +68,8 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
SDValue Srl = In.getOperand(0);
if (Srl.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ConstantSDNode *ShiftAmt =
+ dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
if (ShiftAmt->getZExtValue() == 16) {
Out = stripBitcast(Srl.getOperand(0));
return true;
@@ -284,22 +285,20 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
- SDValue Ops[] = {
- LdHi->getChain(), LdHi->getBasePtr(), TiedIn
- };
+ SDValue Ops[] = {LdHi->getChain(), LdHi->getBasePtr(), TiedIn};
unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
if (LdHi->getMemoryVT() == MVT::i8) {
- LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
- AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
+ LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD
+ ? AMDGPUISD::LOAD_D16_HI_I8
+ : AMDGPUISD::LOAD_D16_HI_U8;
} else {
assert(LdHi->getMemoryVT() == MVT::i16);
}
SDValue NewLoadHi =
- CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
- Ops, LdHi->getMemoryVT(),
- LdHi->getMemOperand());
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, Ops,
+ LdHi->getMemoryVT(), LdHi->getMemOperand());
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
@@ -318,22 +317,20 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
if (LdLo->getMemoryVT() == MVT::i8) {
- LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
- AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
+ LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD
+ ? AMDGPUISD::LOAD_D16_LO_I8
+ : AMDGPUISD::LOAD_D16_LO_U8;
} else {
assert(LdLo->getMemoryVT() == MVT::i16);
}
TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
- SDValue Ops[] = {
- LdLo->getChain(), LdLo->getBasePtr(), TiedIn
- };
+ SDValue Ops[] = {LdLo->getChain(), LdLo->getBasePtr(), TiedIn};
SDValue NewLoadLo =
- CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
- Ops, LdLo->getMemoryVT(),
- LdLo->getMemOperand());
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, Ops,
+ LdLo->getMemoryVT(), LdLo->getMemOperand());
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
@@ -343,144 +340,9 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
return false;
}
-// Resolve the constant byte offset within the per-function VGPR file for a
-// "VGPR as memory" access whose (legalized) address is \p Ptr. Returns
-// std::nullopt if \p Ptr is not a constant offset from a VGPR-as-memory frame
-// object.
-static std::optional<unsigned>
-getVGPRFrameByteOffset(SDValue Ptr, const MachineFunction &MF) {
- unsigned ExtraOffset = 0;
- if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
- if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1))) {
- ExtraOffset = C->getZExtValue();
- Ptr = Ptr.getOperand(0);
- }
- }
- auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
- if (!FI)
- return std::nullopt;
- const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
- if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
- return std::nullopt;
- return AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
-}
-
-// Lower a load/store of a "VGPR as memory" object into one
-// SI_VGPR_FRAME_{LOAD,STORE} pseudo per dword, each carrying a constant byte
-// offset. The pseudos are later expanded into subregister copies by
-// AMDGPUPrivateObjectVGPRs. Accesses wider than a dword (e.g. i64, vectors) are
-// split into their dword lanes; sub-dword and non-dword-multiple accesses are
-// left alone (AMDGPUPromoteAlloca demotes such objects to scratch). Returns
-// true if \p N was rewritten.
-bool AMDGPUDAGToDAGISel::rewriteVGPRFrameAccess(SDNode *N) {
- if (auto *Load = dyn_cast<LoadSDNode>(N)) {
- if (Load->getAddressSpace() != AMDGPUAS::VGPR || !Load->isSimple() ||
- Load->getExtensionType() != ISD::NON_EXTLOAD)
- return false;
- EVT VT = Load->getValueType(0);
- unsigned Bits = VT.getFixedSizeInBits();
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- std::optional<unsigned> Offset =
- getVGPRFrameByteOffset(Load->getBasePtr(), *MF);
- if (!Offset || (*Offset % 4 != 0))
- return false;
-
- SDLoc DL(N);
- unsigned NumDwords = Bits / 32;
- SmallVector<SDValue, 4> Dwords;
- SmallVector<SDValue, 4> Chains;
- for (unsigned I = 0; I != NumDwords; ++I) {
- SDValue Ops[] = {CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
- Load->getChain()};
- MachineSDNode *Lane = CurDAG->getMachineNode(
- AMDGPU::SI_VGPR_FRAME_LOAD, DL, MVT::i32, MVT::Other, Ops);
- if (I == 0)
- CurDAG->setNodeMemRefs(Lane, {Load->getMemOperand()});
- Dwords.push_back(SDValue(Lane, 0));
- Chains.push_back(SDValue(Lane, 1));
- }
-
- SDValue Val;
- if (NumDwords == 1) {
- Val = Dwords[0];
- if (VT != MVT::i32)
- Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Val);
- } else {
- EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
- SDValue Vec = CurDAG->getNode(ISD::BUILD_VECTOR, DL, VecVT, Dwords);
- Val = CurDAG->getNode(ISD::BITCAST, DL, VT, Vec);
- }
- SDValue Chain = NumDwords == 1 ? Chains[0]
- : CurDAG->getNode(ISD::TokenFactor, DL,
- MVT::Other, Chains);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 0), Val);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Load, 1), Chain);
- return true;
- }
-
- if (auto *Store = dyn_cast<StoreSDNode>(N)) {
- if (Store->getAddressSpace() != AMDGPUAS::VGPR || !Store->isSimple() ||
- Store->isTruncatingStore())
- return false;
- SDValue Val = Store->getValue();
- EVT VT = Val.getValueType();
- unsigned Bits = VT.getFixedSizeInBits();
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- std::optional<unsigned> Offset =
- getVGPRFrameByteOffset(Store->getBasePtr(), *MF);
- if (!Offset || (*Offset % 4 != 0))
- return false;
-
- SDLoc DL(N);
- unsigned NumDwords = Bits / 32;
- SmallVector<SDValue, 4> Dwords;
- if (NumDwords == 1) {
- if (VT != MVT::i32)
- Val = CurDAG->getNode(ISD::BITCAST, DL, MVT::i32, Val);
- Dwords.push_back(Val);
- } else {
- EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), MVT::i32, NumDwords);
- SDValue Vec = CurDAG->getNode(ISD::BITCAST, DL, VecVT, Val);
- for (unsigned I = 0; I != NumDwords; ++I)
- Dwords.push_back(CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
- Vec,
- CurDAG->getConstant(I, DL, MVT::i32)));
- }
-
- SmallVector<SDValue, 4> Chains;
- for (unsigned I = 0; I != NumDwords; ++I) {
- SDValue Ops[] = {Dwords[I],
- CurDAG->getTargetConstant(*Offset + I * 4, DL, MVT::i32),
- Store->getChain()};
- MachineSDNode *Lane = CurDAG->getMachineNode(AMDGPU::SI_VGPR_FRAME_STORE,
- DL, MVT::Other, Ops);
- if (I == 0)
- CurDAG->setNodeMemRefs(Lane, {Store->getMemOperand()});
- Chains.push_back(SDValue(Lane, 0));
- }
- SDValue Chain = NumDwords == 1 ? Chains[0]
- : CurDAG->getNode(ISD::TokenFactor, DL,
- MVT::Other, Chains);
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(Store, 0), Chain);
- return true;
- }
-
- return false;
-}
-
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
bool MadeChange = false;
- // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into frame pseudos. This
- // is scoped to addrspace(13) nodes, so it never perturbs ordinary memory ops.
- SelectionDAG::allnodes_iterator VGPRPos = CurDAG->allnodes_end();
- while (VGPRPos != CurDAG->allnodes_begin()) {
- SDNode *N = &*--VGPRPos;
- MadeChange |= rewriteVGPRFrameAccess(N);
- }
-
if (Subtarget->d16PreservesUnusedBits()) {
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
while (Position != CurDAG->allnodes_begin()) {
@@ -501,8 +363,7 @@ void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
if (MadeChange) {
CurDAG->RemoveDeadNodes();
- LLVM_DEBUG(dbgs() << "After PreProcess:\n";
- CurDAG->dump(););
+ LLVM_DEBUG(dbgs() << "After PreProcess:\n"; CurDAG->dump(););
}
}
@@ -524,8 +385,8 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
-const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
- unsigned OpNo) const {
+const TargetRegisterClass *
+AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
if (!N->isMachineOpcode()) {
if (N->getOpcode() == ISD::CopyToReg) {
Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
@@ -563,14 +424,14 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
SDValue SubRegOp = N->getOperand(OpNo + 1);
unsigned SubRegIdx = SubRegOp->getAsZExtVal();
return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
- SubRegIdx);
+ SubRegIdx);
}
}
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
SDValue Glue) const {
- SmallVector <SDValue, 8> Ops;
+ SmallVector<SDValue, 8> Ops;
Ops.push_back(NewChain); // Replace the chain.
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));
@@ -580,8 +441,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
@@ -598,8 +459,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
} else if (AS == AMDGPUAS::REGION_ADDRESS) {
MachineFunction &MF = CurDAG->getMachineFunction();
unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
- return
- glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+ return glueCopyToM0(N,
+ CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
}
return N;
}
@@ -681,7 +542,7 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
}
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
- "supported yet");
+ "supported yet");
// 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
@@ -707,8 +568,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
if (NOps != NumVectorElts) {
// Fill in the missing undef elements if this was a scalar_to_vector.
assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
- MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- DL, EltVT);
+ MachineSDNode *ImpDef =
+ CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
i * EltSizeInRegs, EltSizeInRegs)
@@ -817,7 +678,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return; // Already selected.
+ return; // Already selected.
}
// isa<MemSDNode> almost works but is slightly too permissive for some DS
@@ -905,8 +766,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
} else {
llvm_unreachable("Unhandled value type for BUILD_PAIR");
}
- const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
- N->getOperand(1), SubReg1 };
+ const SDValue Ops[] = {RC, N->getOperand(0), SubReg0, N->getOperand(1),
+ SubReg1};
ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops));
return;
@@ -957,8 +818,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
- WidthVal));
+ ReplaceNode(
+ N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
return;
}
case AMDGPUISD::DIV_SCALE: {
@@ -974,8 +835,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::UMUL_LOHI:
return SelectMUL_LOHI(N);
case ISD::CopyToReg: {
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
@@ -1003,7 +864,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (N->getValueType(0) == MVT::i32) {
MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
- { N->getOperand(0), N->getOperand(1) });
+ {N->getOperand(0), N->getOperand(1)});
SelectCode(N);
return;
}
@@ -1166,7 +1027,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
- (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
Base = Addr.getOperand(0);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else {
@@ -1179,9 +1040,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
const SDLoc &DL) const {
- SDNode *Mov = CurDAG->getMachineNode(
- AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getTargetConstant(Val, DL, MVT::i32));
+ SDNode *Mov =
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Val, DL, MVT::i32));
return SDValue(Mov, 0);
}
@@ -1322,7 +1183,8 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
- // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
+ // omod
SDValue Ops[10];
SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -1360,8 +1222,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
assert(VT == MVT::f32 || VT == MVT::f64);
- unsigned Opc
- = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
+ unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64
+ : AMDGPU::V_DIV_SCALE_F32_e64;
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
// omod
@@ -1388,8 +1250,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- Clamp };
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp};
if (UseNoCarry) {
MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
@@ -1468,7 +1329,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
}
} else if (Addr.getOpcode() == ISD::SUB) {
// sub C, x -> add (sub 0, x), C
- if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ if (const ConstantSDNode *C =
+ dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
if (isDSOffsetLegal(SDValue(), ByteOffset)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1476,8 +1338,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// XXX - This is kind of hacky. Create a dummy sub node so we can check
// the known bits in isDSOffsetLegal. We need to emit the selected node
// here, so this is thrown away.
- SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
- Zero, Addr.getOperand(1));
+ SDValue Sub =
+ CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset)) {
SmallVector<SDValue, 3> Opnds;
@@ -1511,8 +1373,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
- MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- DL, MVT::i32, Zero);
+ MachineSDNode *MovZero =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
@@ -1817,8 +1679,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (C->getSExtValue()) {
SDLoc DL(Addr);
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
return true;
@@ -1827,7 +1689,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
}
-std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+std::pair<SDValue, SDValue>
+AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
auto *FI = dyn_cast<FrameIndexSDNode>(N);
@@ -1841,9 +1704,9 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
- SDValue Addr, SDValue &Rsrc,
- SDValue &VAddr, SDValue &SOffset,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
+ SDValue &Rsrc, SDValue &VAddr,
+ SDValue &SOffset,
SDValue &ImmOffset) const {
SDLoc DL(Addr);
@@ -1861,8 +1724,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
SDValue HighBits =
CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
- MachineSDNode *MovHighBits = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+ MachineSDNode *MovHighBits =
+ CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1918,8 +1781,7 @@ static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
return RC && TRI.isSGPRClass(RC);
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
- SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
@@ -1962,8 +1824,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &SOffset, SDValue &Offset
- ) const {
+ SDValue &SOffset,
+ SDValue &Offset) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1977,8 +1839,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
maskTrailingOnes<uint64_t>(32); // Size
SDLoc DL(Addr);
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
+ const SITargetLowering &Lowering =
+ *static_cast<const SITargetLowering *>(getTargetLowering());
SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
return true;
@@ -1999,14 +1861,14 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
- N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+static MemSDNode *findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
return MN;
assert(isa<BuildVectorSDNode>(N));
for (SDValue V : N->op_values())
if (MemSDNode *MN =
- dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
return MN;
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
@@ -2387,8 +2249,8 @@ static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
// Materialize this into a scalar move for scalar address to avoid
// readfirstlane.
auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
- SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
- FI->getValueType(0));
+ SDValue TFI =
+ CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
MVT::i32, TFI, SAddr.getOperand(1)),
0);
@@ -2488,8 +2350,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
- CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
if (!isFlatScratchBaseLegal(Addr))
@@ -2677,16 +2539,16 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
const SDValue Ops[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
- Addr,
- CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
- 0),
- CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+ CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+ Addr,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
};
- return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
- Ops), 0);
+ return SDValue(
+ CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, Ops), 0);
}
// Match a base and an immediate (if Offset is not null) or an SGPR (if
@@ -2822,8 +2684,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
-bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
- SDValue &Base,
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base,
SDValue &Offset) const {
SDLoc DL(Index);
@@ -2888,7 +2749,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
if (0 < BVal && BVal <= CVal && CVal < 32) {
bool Signed = N->getOpcode() == ISD::SRA;
ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
- 32 - CVal));
+ 32 - CVal));
return;
}
}
@@ -2933,7 +2794,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = llvm::popcount(MaskVal);
ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
- WidthVal));
+ WidthVal));
return;
}
}
@@ -3028,8 +2889,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);
if (Cond.isUndef()) {
- CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
- N->getOperand(2), N->getOperand(0));
+ CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, N->getOperand(2),
+ N->getOperand(0));
return;
}
@@ -3129,8 +2990,8 @@ void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
- unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
- AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? AMDGPU::DS_APPEND
+ : AMDGPU::DS_CONSUME;
SDValue Chain = N->getOperand(0);
SDValue Ptr = N->getOperand(2);
@@ -3156,10 +3017,8 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
}
SDValue Ops[] = {
- Offset,
- CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
- Chain,
- N->getOperand(N->getNumOperands() - 1) // New glue
+ Offset, CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), Chain,
+ N->getOperand(N->getNumOperands() - 1) // New glue
};
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
@@ -3282,14 +3141,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
// Prefer to do the shift in an SGPR since it should be possible to use m0
// as the result directly. If it's already an SGPR, it will be eliminated
// later.
- SDNode *SGPROffset
- = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
- BaseOffset);
+ SDNode *SGPROffset = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
+ MVT::i32, BaseOffset);
// Shift to offset in m0
- SDNode *M0Base
- = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
- SDValue(SGPROffset, 0),
- CurDAG->getTargetConstant(16, SL, MVT::i32));
+ SDNode *M0Base = CurDAG->getMachineNode(
+ AMDGPU::S_LSHL_B32, SL, MVT::i32, SDValue(SGPROffset, 0),
+ CurDAG->getTargetConstant(16, SL, MVT::i32));
glueCopyToM0(N, SDValue(M0Base, 0));
}
@@ -3369,27 +3226,27 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
- SDNode *InterpMov =
- CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
- CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
- N->getOperand(3), // Attr
- N->getOperand(2), // Attrchan
- ToM0.getValue(1) // In glue
- });
-
- SDNode *InterpP1LV =
- CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
- CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- N->getOperand(1), // Src0
- N->getOperand(3), // Attr
- N->getOperand(2), // Attrchan
- CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
- SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
- N->getOperand(4), // high
- CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
- CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
- SDValue(InterpMov, 1)
- });
+ SDNode *InterpMov = CurDAG->getMachineNode(
+ AMDGPU::V_INTERP_MOV_F32, DL, VTs,
+ {
+ CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ ToM0.getValue(1) // In glue
+ });
+
+ SDNode *InterpP1LV = CurDAG->getMachineNode(
+ AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32,
+ {CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+ N->getOperand(1), // Src0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+ SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+ N->getOperand(4), // high
+ CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+ SDValue(InterpMov, 1)});
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
@@ -3516,8 +3373,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
- SDValue Log2WaveSize =
- CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
+ SDValue Log2WaveSize = CurDAG->getTargetConstant(
+ Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
{N->getOperand(0), Log2WaveSize});
}
@@ -3790,14 +3647,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (Lo.getValueSizeInBits() > VecSize) {
Lo = CurDAG->getTargetExtractSubreg(
- (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
- MVT::getIntegerVT(VecSize), Lo);
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Lo);
}
if (Hi.getValueSizeInBits() > VecSize) {
Hi = CurDAG->getTargetExtractSubreg(
- (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
- MVT::getIntegerVT(VecSize), Hi);
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Hi);
}
assert(Lo.getValueSizeInBits() <= VecSize &&
@@ -3837,15 +3694,18 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
TRI->getSubRegFromChannel(NumRegs, NumRegs), SL, MVT::i32)};
Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
- Src.getValueType(), Ops), 0);
+ Src.getValueType(), Ops),
+ 0);
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
- uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
- .bitcastToAPInt().getZExtValue();
+ uint64_t Lit = cast<ConstantFPSDNode>(Lo)
+ ->getValueAPF()
+ .bitcastToAPInt()
+ .getZExtValue();
if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
@@ -4527,7 +4387,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
+ const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->isAllOnes()) {
@@ -4588,8 +4448,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
SDValue RHS = In.getOperand(1);
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) ||
- !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
Src = std::move(Backup);
return std::make_pair(0, 0);
}
@@ -4783,7 +4642,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
SDLoc SL(In);
return CurDAG->getConstant(
- C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+ C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
}
SDValue Src;
@@ -4793,7 +4652,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
return SDValue();
}
-bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
@@ -4802,7 +4661,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
unsigned Limit = 0;
bool AllUsesAcceptSReg = true;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- Limit < 10 && U != E; ++U, ++Limit) {
+ Limit < 10 && U != E; ++U, ++Limit) {
const TargetRegisterClass *RC =
getOperandRegClass(U->getUser(), U->getOperandNo());
@@ -4872,8 +4731,8 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
- const AMDGPUTargetLowering& Lowering =
- *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
+ const AMDGPUTargetLowering &Lowering =
+ *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
bool IsModified = false;
do {
IsModified = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index cf62874912742..a06c15594bf0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -67,7 +67,6 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool runOnMachineFunction(MachineFunction &MF) override;
bool matchLoadD16FromBuildVector(SDNode *N) const;
- bool rewriteVGPRFrameAccess(SDNode *N);
void PreprocessISelDAG() override;
void Select(SDNode *N) override;
void PostprocessISelDAG() override;
@@ -274,7 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
- SDValue &Tbl) const;
+ SDValue &Tbl) const;
SDValue getHi16Elt(SDValue In) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index a3a1cf6f18bed..d8ff923619193 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -38,6 +38,50 @@ using namespace llvm;
namespace {
+static bool isVGPRFrameLoad(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B32:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B64:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B96:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B128:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B160:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B192:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B224:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B256:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B288:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B320:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B352:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B384:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B512:
+ case AMDGPU::SI_VGPR_FRAME_LOAD_B1024:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isVGPRFrameStore(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::SI_VGPR_FRAME_STORE_B32:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B64:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B96:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B128:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B160:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B192:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B224:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B256:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B288:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B320:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B352:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B384:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B512:
+ case AMDGPU::SI_VGPR_FRAME_STORE_B1024:
+ return true;
+ default:
+ return false;
+ }
+}
+
class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
public:
static char ID;
@@ -75,16 +119,20 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Collect the pseudos and determine how many dwords the backing tuple needs.
+ // Each pseudo carries a constant dword index and accesses as many dwords as
+ // its data register class is wide.
SmallVector<MachineInstr *, 8> Worklist;
unsigned NumDwords = 0;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
unsigned Opc = MI.getOpcode();
- if (Opc != AMDGPU::SI_VGPR_FRAME_LOAD &&
- Opc != AMDGPU::SI_VGPR_FRAME_STORE)
+ if (!isVGPRFrameLoad(Opc) && !isVGPRFrameStore(Opc))
continue;
- unsigned ByteOffset = MI.getOperand(1).getImm();
- NumDwords = std::max(NumDwords, ByteOffset / 4 + 1);
+ unsigned Dword = MI.getOperand(1).getImm();
+ unsigned AccessDwords =
+ TRI->getRegSizeInBits(*MRI.getRegClass(MI.getOperand(0).getReg())) /
+ 32;
+ NumDwords = std::max(NumDwords, Dword + AccessDwords);
Worklist.push_back(&MI);
}
}
@@ -108,13 +156,17 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr *MI : Worklist) {
MachineBasicBlock &MBB = *MI->getParent();
const DebugLoc &DL = MI->getDebugLoc();
- unsigned Dword = MI->getOperand(1).getImm() / 4;
- unsigned SubReg = NumDwords == 1
- ? AMDGPU::NoSubRegister
- : SIRegisterInfo::getSubRegFromChannel(Dword);
+ unsigned Dword = MI->getOperand(1).getImm();
+ unsigned AccessDwords =
+ TRI->getRegSizeInBits(*MRI.getRegClass(MI->getOperand(0).getReg())) /
+ 32;
+ unsigned SubReg =
+ (Dword == 0 && AccessDwords == NumDwords)
+ ? AMDGPU::NoSubRegister
+ : SIRegisterInfo::getSubRegFromChannel(Dword, AccessDwords);
MachineInstr *Copy;
- if (MI->getOpcode() == AMDGPU::SI_VGPR_FRAME_LOAD) {
+ if (isVGPRFrameLoad(MI->getOpcode())) {
Register Dst = MI->getOperand(0).getReg();
Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
.addReg(Storage, {}, SubReg);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 32ab847c8d8f3..478f54b7cdfc3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -404,11 +404,11 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
// A "VGPR as memory" object can only be realized in registers today when every
// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
// bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses need gfx13 support, which is not yet
-// available; such objects fall back to scratch instead.
+// indexing and escaping addresses are not yet supported; such objects fall back
+// to scratch instead.
//
-// TODO-GFX13: Lower dynamically-indexed / escaping VGPR objects with gfx13
-// support so this fallback is no longer needed.
+// TODO: Lower dynamically-indexed / escaping VGPR objects so this fallback is no
+// longer needed.
static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
const DataLayout &DL) {
// An access is lowerable if it covers a whole number of dwords and starts at
@@ -514,7 +514,7 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
setFunctionLimits(F);
// "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
- // gfx12xx / gfx13xx. On any other target the objects fall back to scratch.
+ // GFX12 and later. On any other target the objects fall back to scratch.
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
const bool TargetSupportsVGPRAsMemory =
ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4a32b81b06ff5..db2301ba28359 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6457,7 +6457,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
else
ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
}
- ClampInstr.addReg(Src0); // src0
+ ClampInstr.addReg(Src0); // src0
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src1 mod
ClampInstr.addReg(Src1); // src1
@@ -12432,18 +12432,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
- Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
- : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
- Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
- : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
break;
}
@@ -12473,11 +12473,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0,
- DL, MVT::i8)); // swz
+ DL, MVT::i8)); // swz
Ops.push_back(
DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
- Ops.push_back(M0Val.getValue(0)); // Chain
- Ops.push_back(M0Val.getValue(1)); // Glue
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
@@ -12555,7 +12555,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(VOffset);
}
- Ops.push_back(Op.getOperand(5)); // Offset
+ Ops.push_back(Op.getOperand(5)); // Offset
unsigned Aux = Op.getConstantOperandVal(6);
Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
@@ -14330,6 +14330,92 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
+/// Lower a load/store of a "VGPR as memory" object (an alloca in
+/// AMDGPUAS::VGPR) into an AMDGPUISD::REG_{LOAD,STORE} node carrying the
+/// constant dword index of the access within the per-function VGPR file. These
+/// nodes are selected into register copies via the SI_VGPR_FRAME_* pseudos and
+/// the AMDGPUPrivateObjectVGPRs pass.
+///
+/// Returns SDValue() if the access is not a plain whole-dword load/store at a
+/// constant dword-aligned file offset; AMDGPUPromoteAlloca demotes such
+/// objects to scratch, so accesses reaching here are expected to qualify.
+SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
+ SelectionDAG &DAG) const {
+ MemSDNode *MemOp = cast<MemSDNode>(Op);
+ const MachineFunction &MF = DAG.getMachineFunction();
+ SDLoc DL(Op);
+
+ // Resolve the constant byte offset of the access within the VGPR file
+ // directly from the frame index (plus a constant GEP offset); the frame index
+ // itself is not custom-lowered.
+ SDValue Ptr = MemOp->getBasePtr();
+ unsigned ExtraOffset = 0;
+ if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
+ auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1));
+ if (!C)
+ return SDValue();
+ ExtraOffset = C->getZExtValue();
+ Ptr = Ptr.getOperand(0);
+ }
+ auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
+ if (!FI)
+ return SDValue();
+ const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
+ if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
+ return SDValue();
+ unsigned ByteOffset =
+ AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
+ if (ByteOffset % 4 != 0)
+ return SDValue();
+
+ EVT MemVT = MemOp->getMemoryVT();
+ unsigned BitWidth = MemVT.getSizeInBits();
+ // Only whole-dword accesses are kept in registers; sub-dword and
+ // non-dword-multiple objects are demoted to scratch by AMDGPUPromoteAlloca.
+ if (BitWidth == 0 || BitWidth % 32 != 0)
+ return SDValue();
+ if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
+ return SDValue();
+
+ // Bail out for types we cannot handle (extending loads, truncating stores).
+ if (auto *Load = dyn_cast<LoadSDNode>(MemOp)) {
+ if (Load->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ } else if (cast<StoreSDNode>(MemOp)->isTruncatingStore()) {
+ return SDValue();
+ }
+
+ // Use a register-legal i32 / vector-of-i32 view so a single node covers the
+ // whole access; bitcast through it when the memory type is not register
+ // legal. Sub-dword and non-dword-multiple widths were rejected above.
+ EVT RegVT = MemVT;
+ if (!isTypeLegal(RegVT)) {
+ unsigned NumDwords = BitWidth / 32;
+ RegVT = NumDwords == 1
+ ? EVT(MVT::i32)
+ : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumDwords);
+ }
+
+ SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+ SDValue Chain = MemOp->getChain();
+ if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+ SDValue Value = StoreOp->getValue();
+ if (RegVT != MemVT)
+ Value = DAG.getNode(ISD::BITCAST, DL, RegVT, Value);
+ return DAG.getMemIntrinsicNode(
+ AMDGPUISD::REG_STORE, DL, DAG.getVTList(MVT::Other),
+ {Chain, Value, Index}, MemVT, StoreOp->getMemOperand());
+ }
+
+ SDValue NewLoad = DAG.getMemIntrinsicNode(
+ AMDGPUISD::REG_LOAD, DL, DAG.getVTList(RegVT, MVT::Other), {Chain, Index},
+ MemVT, MemOp->getMemOperand());
+ if (RegVT == MemVT)
+ return NewLoad;
+ SDValue Value = DAG.getNode(ISD::BITCAST, DL, MemVT, NewLoad);
+ return DAG.getMergeValues({Value, NewLoad.getValue(1)}, DL);
+}
+
/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
/// by the chain and intrinsic ID. Theoretically we would also need to check the
/// specific intrinsic, but they all place the pointer operand first.
@@ -18567,6 +18653,15 @@ SDValue SITargetLowering::performSelectCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ // Lower "VGPR as memory" (AMDGPUAS::VGPR) accesses into AMDGPUISD::REG_{LOAD,
+ // STORE}. This is scoped to addrspace(13) memory nodes, so it never perturbs
+ // ordinary memory operations.
+ unsigned Opc = N->getOpcode();
+ if ((Opc == ISD::LOAD || Opc == ISD::STORE) &&
+ cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::VGPR)
+ if (SDValue V = LowerLoadStoreVGPR(SDValue(N, 0), DCI.DAG))
+ return V;
+
switch (N->getOpcode()) {
case ISD::ABS:
if (SDValue Res = promoteUniformUnaryOpToI32(SDValue(N, 0), DCI))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index c98426cdac0b1..37f3bb37d1aef 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -34,16 +34,17 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const GCNSubtarget *Subtarget;
public:
- MVT getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const override;
- unsigned getVectorTypeBreakdownForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
- unsigned &NumIntermediates, MVT &RegisterVT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const override;
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
@@ -72,8 +73,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
- SDValue getPreloadedValue(SelectionDAG &DAG,
- const SIMachineFunctionInfo &MFI,
+ SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue) const;
@@ -81,8 +81,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG) const override;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
- MVT VT, unsigned Offset) const;
+ SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT,
+ unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG, bool WithChain) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
@@ -125,6 +125,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLoadStoreVGPR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const;
@@ -133,8 +134,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
- SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
- SelectionDAG &DAG, ArrayRef<SDValue> Ops,
+ SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
@@ -151,14 +152,12 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
- SDValue getFPExtOrFPRound(SelectionDAG &DAG,
- SDValue Op,
- const SDLoc &DL,
+ SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
EVT VT) const;
- SDValue convertArgType(
- SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
- bool Signed, const ISD::InputArg *Arg = nullptr) const;
+ SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
+ SDValue Val, bool Signed,
+ const ISD::InputArg *Arg = nullptr) const;
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
@@ -194,13 +193,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
- SDValue performUCharToFloatCombine(SDNode *N,
- DAGCombinerInfo &DCI) const;
+ SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performSHLPtrCombine(SDNode *N,
- unsigned AS,
- EVT MemVT,
+ SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT,
DAGCombinerInfo &DCI) const;
SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
@@ -234,8 +230,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
- unsigned getFusedOpcode(const SelectionDAG &DAG,
- const SDNode *N0, const SDNode *N1) const;
+ unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0,
+ const SDNode *N1) const;
SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
DAGCombinerInfo &DCI) const;
@@ -398,7 +394,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
getPreferredVectorAction(MVT VT) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
- Type *Ty) const override;
+ Type *Ty) const override;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
@@ -417,8 +413,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -426,8 +422,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- bool CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context, const Type *RetTy) const override;
@@ -436,13 +432,11 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- void passSpecialInputs(
- CallLoweringInfo &CLI,
- CCState &CCInfo,
- const SIMachineFunctionInfo &Info,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
- SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain) const;
+ void
+ passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
@@ -454,10 +448,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool mayBeEmittedAsTailCall(const CallInst *) const override;
bool isEligibleForTailCallOptimization(
- SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -473,7 +467,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
- Register getRegisterByName(const char* RegName, LLT VT,
+ Register getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const override;
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
@@ -530,8 +524,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint,
uint64_t Val) const;
- bool checkAsmConstraintValA(SDValue Op,
- uint64_t Val,
+ bool checkAsmConstraintValA(SDValue Op, uint64_t Val,
unsigned MaxSize = 64) const;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
@@ -542,8 +535,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- void computeKnownBitsForFrameIndex(int FrameIdx,
- KnownBits &Known,
+ void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known,
const MachineFunction &MF) const override;
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R,
KnownBits &Known,
@@ -589,8 +581,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicLoad(LoadInst *LI) const override;
void emitExpandAtomicStore(StoreInst *SI) const override;
- LoadInst *
- lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+ LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
const TargetRegisterClass *getRegClassFor(MVT VT,
bool isDivergent) const override;
@@ -600,8 +591,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned
getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override;
- void allocateHSAUserSGPRs(CCState &CCInfo,
- MachineFunction &MF,
+ void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
@@ -616,28 +606,21 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSystemSGPRs(CCState &CCInfo,
- MachineFunction &MF,
+ void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv,
- bool IsShader) const;
+ CallingConv::ID CallConv, bool IsShader) const;
- void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
+ void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSpecialInputSGPRs(
- CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) const;
-
- void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
+ void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
- MachineFunction &MF,
+
+ void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8c30e53e9b4e4..f106bbdacb957 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -59,6 +59,19 @@ def GFX10Gen : GFXGen<isGFX10Only, "GFX10", "_gfx10", SIEncodingFamily.G
// modifier behavior with dx10_enable.
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
+// "VGPR as memory" (AMDGPUAS::VGPR) whole-dword load/store with a dword index
+// operand into the per-function VGPR file. When the index is a compile-time
+// constant these are selected into register copies via the SI_VGPR_FRAME_*
+// pseudos.
+def SDTRegIdxLoad : SDTypeProfile<1, 1,
+ [SDTCisVT<1, i32>]>; // dword_index
+def SDTRegIdxStore : SDTypeProfile<0, 2,
+ [SDTCisVT<1, i32>]>; // data, dword_index
+def SIreg_load : SDNode<"AMDGPUISD::REG_LOAD", SDTRegIdxLoad,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def SIreg_store : SDNode<"AMDGPUISD::REG_STORE", SDTRegIdxStore,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
def SDTSBufferLoad : SDTypeProfile<1, 3,
[ // vdata
SDTCisVT<1, v4i32>, // rsrc
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3594caef86782..80a42a66b2368 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1243,25 +1243,46 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
}
} // End Spill = 1, VALU = 1, isConvergent = 1
-// "VGPR as memory" pseudo accesses: a load/store of a single dword from/to an
-// alloca in the VGPR address space (AMDGPUAS::VGPR), at a constant byte offset
-// within the per-function VGPR file. They are produced during instruction
-// selection and rewritten into register copies by the AMDGPUPrivateObjectVGPRs
-// pass before register allocation.
+// "VGPR as memory" pseudo accesses: a load/store of a whole VGPR tuple (one or
+// more dwords) from/to an alloca in the VGPR address space (AMDGPUAS::VGPR), at
+// a constant dword index within the per-function VGPR file. They are selected
+// from AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
+// register copies by the AMDGPUPrivateObjectVGPRs pass before register
+// allocation.
let hasSideEffects = 0 in {
-def SI_VGPR_FRAME_LOAD : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins i32imm:$offset)> {
- let mayLoad = 1;
- let mayStore = 0;
-}
-
-def SI_VGPR_FRAME_STORE : VPseudoInstSI <(outs),
- (ins VGPR_32:$vdata, i32imm:$offset)> {
- let mayLoad = 0;
- let mayStore = 1;
+foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
+ VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
+ VReg_512, VReg_1024] in {
+ def SI_VGPR_FRAME_LOAD_B#rc.Size : VPseudoInstSI <
+ (outs rc:$vdst), (ins i32imm:$idx)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ }
+ def SI_VGPR_FRAME_STORE_B#rc.Size : VPseudoInstSI <
+ (outs), (ins rc:$vdata, i32imm:$idx)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ }
}
} // End hasSideEffects = 0
+// Select AMDGPUISD::REG_{LOAD,STORE} (with a constant dword index) into the
+// width-matched frame pseudo.
+multiclass VGPRFrameLoadStorePat<ValueType vt> {
+ defvar load_inst = !cast<Instruction>("SI_VGPR_FRAME_LOAD_B"#vt.Size);
+ defvar store_inst = !cast<Instruction>("SI_VGPR_FRAME_STORE_B"#vt.Size);
+ def : GCNPat<(vt (SIreg_load (i32 imm:$idx))), (load_inst imm:$idx)>;
+ def : GCNPat<(SIreg_store vt:$data, (i32 imm:$idx)),
+ (store_inst $data, imm:$idx)>;
+}
+
+foreach vt = !listconcat(
+ Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
+ Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
+ Reg288Types.types, Reg320Types.types, Reg352Types.types, Reg384Types.types,
+ Reg512Types.types, Reg1024Types.types) in
+defm : VGPRFrameLoadStorePat<vt>;
+
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
index 63ba44b479279..0a78d119ded18 100644
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
@@ -1,12 +1,11 @@
; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and gfx12xx/gfx13xx. On a supported target the object is kept in addrspace(13)
+; and GFX12 and later. On a supported target the object is kept in addrspace(13)
; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
; scratch.
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1310 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
>From 36969e8ccac76e5f0089b214cb59f0d73a3cfc33 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Fri, 26 Jun 2026 09:02:49 -0500
Subject: [PATCH 3/3] Refactor implementation of VGPR as mem according to
review comments
---
clang/include/clang/Basic/AttrDocs.td | 18 +-
.../clang/Basic/DiagnosticCommonKinds.td | 5 -
clang/lib/CodeGen/CGDecl.cpp | 47 +-
clang/test/CodeGen/target-data.c | 4 +-
.../CodeGenHIP/amdgpu-vgpr-O0-warning.hip | 14 -
clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip | 19 +
clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip | 13 +-
clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl | 2 +-
llvm/docs/AMDGPUUsage.rst | 22 +-
llvm/include/llvm/Support/AMDGPUAddrSpace.h | 9 +-
llvm/lib/IR/AutoUpgrade.cpp | 5 +
llvm/lib/IR/Verifier.cpp | 7 +
llvm/lib/IR/VerifierAMDGPU.cpp | 39 +-
llvm/lib/IR/VerifierInternal.h | 4 +
llvm/lib/Target/AMDGPU/AMDGPU.h | 15 +-
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 303 ++++++------
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 +-
.../Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp | 254 ++++++++++
llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +-
.../AMDGPU/AMDGPUPrivateObjectVGPRs.cpp | 138 +++---
.../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 233 +--------
.../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 15 +-
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 43 +-
llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 442 ++++++++++++++++--
llvm/lib/Target/AMDGPU/SIISelLowering.h | 114 +++--
llvm/lib/Target/AMDGPU/SIInstructions.td | 39 +-
.../Target/AMDGPU/SIMachineFunctionInfo.cpp | 11 +
.../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 17 +
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 54 +++
llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 9 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 12 -
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 11 -
llvm/lib/TargetParser/TargetDataLayout.cpp | 4 +-
.../AMDGPU/amdgpu-vgpr-allocate-basic.ll | 109 -----
.../CodeGen/AMDGPU/as-vgpr-alloca-arch.ll | 19 -
.../CodeGen/AMDGPU/as-vgpr-alloca-static.ll | 58 ---
llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 4 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 19 +-
llvm/test/CodeGen/AMDGPU/nullptr.ll | 2 +-
.../CodeGen/AMDGPU/sgpr-regalloc-flags.ll | 1 +
.../AMDGPU/vgpr-as-memory-constexpr.ll | 44 ++
.../CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll | 288 ++++++++++++
.../AMDGPU/vgpr-as-memory-gisel-fallback.ll | 28 ++
.../AMDGPU/vgpr-as-memory-lower-module.ll | 80 ++++
.../CodeGen/AMDGPU/vgpr-as-memory-subdword.ll | 63 +++
llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll | 73 +++
llvm/test/Verifier/AMDGPU/alloca.ll | 56 +--
llvm/test/Verifier/AMDGPU/vgpr-memory.ll | 33 ++
.../Bitcode/DataLayoutUpgradeTest.cpp | 38 +-
50 files changed, 1919 insertions(+), 923 deletions(-)
delete mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
create mode 100644 clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
create mode 100644 llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
delete mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
create mode 100644 llvm/test/Verifier/AMDGPU/vgpr-memory.ll
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index b80265a1aec1d..7439bc23f10d1 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3607,20 +3607,22 @@ An error will be given if:
def AMDGPUVGPRDocs : Documentation {
let Category = DocCatAMDGPUAttributes;
let Content = [{
-This attribute requests that a kernel-local variable be allocated in the
-"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target,
-so that accesses with statically known indices lower to vector register
-copies instead of scratch memory traffic.
+This attribute requests that a kernel-local variable be placed in the
+"VGPR as memory" address space (``addrspace(13)``) on the AMDGPU target, so that
+its accesses lower to vector register copies (constant index) or hardware
+register-indexing sequences (dynamic index) instead of scratch memory traffic.
+
+Such a variable is backed by a fixed block of vector registers rather than the
+stack, so - like an LDS/``__shared__`` variable - it is emitted as an internal
+global in ``addrspace(13)`` with a ``poison`` initializer; its contents are
+undefined until written. This is honored at every optimization level, including
+``-O0``.
Clang supports the ``__attribute__((amdgpu_vgpr))`` or
``[[clang::amdgpu_vgpr]]`` attribute in HIP/CUDA. It may only be applied to
local variables declared in a ``__global__`` (kernel) function; applying it to
a variable in a ``__device__`` or host function, or outside HIP/CUDA, is an
error.
-
-Known limitation: the request is only honored with optimizations enabled. At
-``-O0`` the variable falls back to ordinary (scratch) memory and a warning is
-emitted.
}];
}
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index fe03be43c80c7..f2ed2f4698b8d 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -319,11 +319,6 @@ def warn_stack_protection_ignore_attribute : Warning<
"'stack_protector_ignore' attribute ignored due to "
"'-fstack-protector-all' option">, InGroup<IgnoredAttributes>;
-def warn_amdgpu_vgpr_not_guaranteed_at_O0 : Warning<
- "%0 is not guaranteed to keep the variable in vector registers at -O0; "
- "it may fall back to scratch memory">,
- InGroup<DiagGroup<"amdgpu-vgpr">>;
-
def warn_slh_does_not_support_asm_goto : Warning<
"speculative load hardening does not protect functions with asm goto">,
InGroup<DiagGroup<"slh-asm-goto">>;
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index bca2d11d47c6a..471746ee8522a 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1603,30 +1603,23 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
// building the instruction so that it's there even in no-asserts
// builds.
//
- // "VGPR as memory" objects keep their backing registers only once the
- // optimizing register allocator runs. At -O0 the backend cannot lower
- // these accesses (e.g. when the address escapes a basic block), so the
- // request is not honored: fall back to an ordinary (scratch) alloca and
- // warn, matching the documented behavior.
- // TODO: Lower addrspace(13) allocas at -O0 too (e.g. by spilling the
- // backing tuple to scratch) so this fallback can be removed.
+ // A "VGPR as memory" object (amdgpu_vgpr) is backed by a fixed block of
+ // vector registers rather than the stack, so - like LDS/__shared__ - it
+ // is emitted as an internal global variable in AMDGPUAS::VGPR. Its
+ // contents are not statically initializable (the backing registers have
+ // no defined initial value), so the initializer is poison.
+ // AMDGPULowerModuleVGPRs lays these out and the backend lowers accesses
+ // to register copies (constant index) or indexed moves (dynamic index).
const auto *VGPRAttr = D.getAttr<AMDGPUVGPRAttr>();
- const bool UseVGPRMemory =
- VGPRAttr && CGM.getCodeGenOpts().OptimizationLevel != 0;
- if (VGPRAttr && !UseVGPRMemory)
- CGM.getDiags().Report(D.getLocation(),
- diag::warn_amdgpu_vgpr_not_guaranteed_at_O0)
- << VGPRAttr;
-
- if (UseVGPRMemory) {
- // Allocate directly in AMDGPUAS::VGPR and keep the pointer in that
- // address space so that statically indexed accesses lower to vector
- // register copies instead of scratch memory.
- auto *AI = new llvm::AllocaInst(allocaTy, llvm::AMDGPUAS::VGPR,
- /*ArraySize=*/nullptr, D.getName(),
- AllocaInsertPt->getIterator());
- AI->setAlignment(allocaAlignment.getAsAlign());
- AllocaAddr = RawAddress(AI, allocaTy, allocaAlignment, KnownNonNull);
+ if (VGPRAttr) {
+ auto *GV = new llvm::GlobalVariable(
+ CGM.getModule(), allocaTy, /*isConstant=*/false,
+ llvm::GlobalValue::InternalLinkage,
+ llvm::PoisonValue::get(allocaTy), getStaticDeclName(CGM, D),
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ llvm::AMDGPUAS::VGPR);
+ GV->setAlignment(allocaAlignment.getAsAlign());
+ AllocaAddr = RawAddress(GV, allocaTy, allocaAlignment, KnownNonNull);
address = AllocaAddr;
} else {
address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(),
@@ -1641,10 +1634,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) {
D.isExceptionVariable() && getTarget().getCXXABI().isMicrosoft();
// Emit a lifetime intrinsic if meaningful. There's no point in doing this
- // if we don't have a valid insertion point (?). "VGPR as memory" allocas
- // live in a non-alloca address space, so the standard lifetime markers
- // (which assume the alloca address space) are skipped for them.
- if (HaveInsertPoint() && !IsMSCatchParam && !UseVGPRMemory) {
+ // if we don't have a valid insertion point (?). "VGPR as memory" objects
+ // are globals, not allocas, so the standard lifetime markers (which
+ // assume a stack slot) are skipped for them.
+ if (HaveInsertPoint() && !IsMSCatchParam && !VGPRAttr) {
// If there's a jump into the lifetime of this variable, its lifetime
// gets broken up into several regions in IR, which requires more work
// to handle correctly. For now, just omit the intrinsics; this is a
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index a5e0b814c7042..f03aaba8b53dd 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,12 +160,12 @@
// RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SI: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// Test default -target-cpu
// RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
// RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// R600SIDefault: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
// RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=AARCH64
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
deleted file mode 100644
index 4d23008b8ef43..0000000000000
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-O0-warning.hip
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
-// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
-//
-// At -O0 "VGPR as memory" is not honored: the variable falls back to an
-// ordinary (scratch) alloca in addrspace(5) and a warning is emitted.
-
-#define __global__ __attribute__((global))
-
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(5)
-__global__ void kernel(int *out, int i) {
- int buf[4] __attribute__((amdgpu_vgpr)); // expected-warning {{'amdgpu_vgpr' is not guaranteed to keep the variable in vector registers at -O0; it may fall back to scratch memory}}
- buf[2] = i;
- out[0] = buf[2];
-}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
new file mode 100644
index 0000000000000..b8618433055cb
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-O0.hip
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 \
+// RUN: -fcuda-is-device -O0 -emit-llvm -verify -o - %s | FileCheck %s
+//
+// "VGPR as memory" is honored at every optimization level (it is a global, not
+// an alloca that depends on the optimizing register allocator), so at -O0 the
+// variable is still emitted in addrspace(13) with no diagnostic.
+
+// expected-no-diagnostics
+
+#define __global__ __attribute__((global))
+
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
+// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+__global__ void kernel(int *out, int i) {
+ int buf[4] __attribute__((amdgpu_vgpr));
+ buf[2] = i;
+ out[0] = buf[2];
+}
diff --git a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
index 9a5c38e48951c..12a1c24284811 100644
--- a/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
+++ b/clang/test/CodeGenHIP/amdgpu-vgpr-attr.hip
@@ -4,14 +4,15 @@
#define __global__ __attribute__((global))
-// A kernel-local variable marked amdgpu_vgpr is allocated in the "VGPR as
-// memory" address space (addrspace(13)), and its accesses stay in that space.
+// A kernel-local variable marked amdgpu_vgpr is emitted as an internal global
+// in the "VGPR as memory" address space (addrspace(13)) with a poison
+// initializer (like an LDS/__shared__ variable), and its accesses stay in that
+// space.
+// CHECK: @{{.*}}buf = internal addrspace(13) global [4 x i32] poison, align 4
// CHECK-LABEL: define {{.*}}@_Z6kernelPii(
-// CHECK: %buf = alloca [4 x i32], align 4, addrspace(13)
-// CHECK: getelementptr inbounds [4 x i32], ptr addrspace(13) %buf
-// CHECK: store i32 %{{.*}}, ptr addrspace(13)
-// CHECK: load i32, ptr addrspace(13)
+// CHECK: store i32 %{{.*}}, ptr addrspace(13) {{.*}}@{{.*}}buf
+// CHECK: load i32, ptr addrspace(13) {{.*}}@{{.*}}buf
__global__ void kernel(int *out, int i) {
int buf[4] __attribute__((amdgpu_vgpr));
buf[2] = i;
diff --git a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
index 72ce72644b8ea..f120db1aaf6cd 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-env-amdgcn.cl
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -O0 -triple amdgcn -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 %s -O0 -triple amdgcn---opencl -emit-llvm -o - | FileCheck %s
-// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+// CHECK: target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
void foo(void) {}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 8aad903f98561..916dfb9d3b70f 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -982,7 +982,7 @@ supported for the ``amdgcn`` target.
*reserved for future use* 10
*reserved for future use* 11
*reserved for downstream use (LLPC)* 12
- *reserved for future use* 13
+ VGPR as memory 13 N/A VGPR 32 0xFFFFFFFF
*reserved for future use* 14
*reserved for future use* 16
Streamout Registers 128 N/A GS_REGS
@@ -1092,6 +1092,26 @@ supported for the ``amdgcn`` target.
When using code object V5 ``LIBOMPTARGET_STACK_SIZE`` may be used to provide the
private segment size in bytes, for cases where a dynamic stack is used.
+**VGPR as memory**
+ The "VGPR as memory" address space holds small objects directly in vector
+ registers instead of scratch (private) memory, avoiding memory traffic for
+ frequently accessed kernel-local data. Objects in this address space are
+ represented as global variables (similar to how *Local* memory uses LDS
+ global variables) and are backed by a block of physical VGPRs that is
+ reserved out of the register allocator for the duration of the function.
+
+ An address in this space is a register-relative dword index into the reserved
+ VGPR block, not a byte address into an addressable memory segment. A load or
+ store at a constant index lowers to a register copy to/from a fixed VGPR; a
+ load or store at a variable (dynamic) index lowers to a hardware register
+ indexing sequence. Sub-dword (8/16-bit) accesses are implemented as
+ read-modify-write of the containing dword.
+
+ Because the address is not a real memory address, ``addrspacecast`` to or from
+ this address space is rejected by the verifier, as is any initializer other
+ than ``undef``/``poison`` on such a global variable. The numeric value 13
+ coincides with the graphics-only ``CONSTANT_BUFFER_5`` alias, which never co-exists with this feature.
+
**Constant 32-bit**
*TODO*
diff --git a/llvm/include/llvm/Support/AMDGPUAddrSpace.h b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
index e9d3add54d054..206caf8305c5d 100644
--- a/llvm/include/llvm/Support/AMDGPUAddrSpace.h
+++ b/llvm/include/llvm/Support/AMDGPUAddrSpace.h
@@ -96,14 +96,18 @@ namespace AMDGPU {
enum class FlatAddrSpace : unsigned { FLAT, FlatGlobal, FlatScratch };
inline bool isFlatGlobalAddrSpace(unsigned AS) {
+ // AMDGPUAS::VGPR ("VGPR as memory") is backed by registers, not a
+ // flat-addressable memory segment, so it must not be treated as global even
+ // though its numeric value is greater than MAX_AMDGPU_ADDRESS.
return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS || AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+ AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
}
inline bool isExtendedGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+ (AS > AMDGPUAS::MAX_AMDGPU_ADDRESS && AS != AMDGPUAS::VGPR);
}
inline bool isConstantAddressSpace(unsigned AS) {
@@ -185,6 +189,7 @@ constexpr int64_t getNullPointerValue(unsigned AS) {
case PRIVATE_ADDRESS:
case LOCAL_ADDRESS:
case REGION_ADDRESS:
+ case VGPR:
return -1;
default:
return 0;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 3a823f906b012..c753e9e2bf56a 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -6851,6 +6851,11 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
Res.replace(Res.find(OldP8), OldP8.size(), "-p8:128:128:128:48-");
if (!DL.contains("-p9") && !DL.starts_with("p9"))
Res.append("-p9:192:256:256:32");
+
+ // Add sizing for address space 13 ("VGPR as memory"), whose pointers are
+ // 32-bit register-relative indices.
+ if (!DL.contains("-p13") && !DL.starts_with("p13"))
+ Res.append("-p13:32:32");
}
// Upgrade the ELF mangling mode.
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 648446555793b..f8a8f94aed0ca 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -807,6 +807,9 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
"Global variable is too large to fit into the address space", &GV,
GVType);
+ // Target-specific global variable checks.
+ verifyAMDGPUGlobalVariable(*this, GV);
+
if (!GV.hasInitializer()) {
visitGlobalValue(GV);
return;
@@ -3738,6 +3741,10 @@ void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
Check(SrcVTy->getElementCount() ==
cast<VectorType>(DestTy)->getElementCount(),
"AddrSpaceCast vector pointer number of elements mismatch", &I);
+
+ // Target-specific addrspacecast checks.
+ verifyAMDGPUAddrSpaceCast(*this, I);
+
visitInstruction(I);
}
diff --git a/llvm/lib/IR/VerifierAMDGPU.cpp b/llvm/lib/IR/VerifierAMDGPU.cpp
index de9a0c7bef132..1043f0ddcb311 100644
--- a/llvm/lib/IR/VerifierAMDGPU.cpp
+++ b/llvm/lib/IR/VerifierAMDGPU.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
@@ -122,10 +123,40 @@ void llvm::verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI) {
if (!VS.TT.isAMDGPU())
return;
- if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
- AI.getAddressSpace() != AMDGPUAS::VGPR)
- VS.CheckFailed("alloca on amdgpu must be in addrspace(5) or addrspace(13)",
- &AI);
+ if (AI.getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+ VS.CheckFailed("alloca on amdgpu must be in addrspace(5)", &AI);
+}
+
+void llvm::verifyAMDGPUGlobalVariable(VerifierSupport &VS,
+ const GlobalVariable &GV) {
+ if (!VS.TT.isAMDGPU())
+ return;
+
+ if (GV.getAddressSpace() != AMDGPUAS::VGPR)
+ return;
+
+ // "VGPR as memory" objects are backed by registers, which have no defined
+ // initial contents, so (like LDS) they cannot be statically initialized: the
+ // only permitted initializer is an undef/poison placeholder (isa<UndefValue>
+ // also matches poison).
+ Check(!GV.hasInitializer() || isa<UndefValue>(GV.getInitializer()),
+ "global variable in the VGPR address space (13) cannot have an "
+ "initializer",
+ &GV);
+}
+
+void llvm::verifyAMDGPUAddrSpaceCast(VerifierSupport &VS,
+ const AddrSpaceCastInst &I) {
+ if (!VS.TT.isAMDGPU())
+ return;
+
+ // The VGPR address space (13) is register-backed and has no meaningful
+ // numeric address, so it cannot participate in addrspacecast.
+ unsigned SrcAS = I.getSrcAddressSpace();
+ unsigned DestAS = I.getDestAddressSpace();
+ Check(SrcAS != AMDGPUAS::VGPR && DestAS != AMDGPUAS::VGPR,
+ "addrspacecast to or from the VGPR address space (13) is not allowed",
+ &I);
}
bool llvm::isAMDGPUCallBrIntrinsic(Intrinsic::ID ID) {
diff --git a/llvm/lib/IR/VerifierInternal.h b/llvm/lib/IR/VerifierInternal.h
index 922385230179b..51d4c9eb9af21 100644
--- a/llvm/lib/IR/VerifierInternal.h
+++ b/llvm/lib/IR/VerifierInternal.h
@@ -221,6 +221,10 @@ void verifyAMDGPUFunctionMetadata(VerifierSupport &VS, const Function &F);
void verifyAMDGPUAlloca(VerifierSupport &VS, const AllocaInst &AI);
+void verifyAMDGPUGlobalVariable(VerifierSupport &VS, const GlobalVariable &GV);
+
+void verifyAMDGPUAddrSpaceCast(VerifierSupport &VS, const AddrSpaceCastInst &I);
+
void verifyAMDGPUIntrinsicCall(VerifierSupport &VS, Intrinsic::ID ID,
CallBase &Call);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 3336ea6d1f943..14c67d542b286 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -263,7 +263,7 @@ void initializeAMDGPUPreloadKernelArgumentsLegacyPass(PassRegistry &);
extern char &AMDGPUPreloadKernelArgumentsLegacyID;
// Passes common to R600 and SI
-FunctionPass *createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel);
+FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
@@ -279,15 +279,12 @@ struct AMDGPUPromoteAllocaPass
void initializeAMDGPUPrivateObjectVGPRsPass(PassRegistry &);
extern char &AMDGPUPrivateObjectVGPRsID;
-// Allocates pre-existing VGPR address space allocas without performing any
-// optimization-oriented alloca promotion. Used at -O0 so that "VGPR as memory"
-// objects remain functional.
-struct AMDGPUVGPRAllocatePass : PassInfoMixin<AMDGPUVGPRAllocatePass> {
- AMDGPUVGPRAllocatePass(TargetMachine &TM) : TM(TM) {}
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+ModulePass *createAMDGPULowerModuleVGPRsPass();
+void initializeAMDGPULowerModuleVGPRsPass(PassRegistry &);
+extern char &AMDGPULowerModuleVGPRsID;
-private:
- TargetMachine &TM;
+struct AMDGPULowerModuleVGPRsPass : PassInfoMixin<AMDGPULowerModuleVGPRsPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
struct AMDGPUPromoteAllocaToVectorPass
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 66c9353cd5c33..7330f3b13f3cb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -21,10 +21,8 @@
#include "R600RegisterInfo.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
-#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -68,8 +66,7 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
SDValue Srl = In.getOperand(0);
if (Srl.getOpcode() == ISD::SRL) {
- if (ConstantSDNode *ShiftAmt =
- dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
+ if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
if (ShiftAmt->getZExtValue() == 16) {
Out = stripBitcast(Srl.getOperand(0));
return true;
@@ -285,20 +282,22 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
- SDValue Ops[] = {LdHi->getChain(), LdHi->getBasePtr(), TiedIn};
+ SDValue Ops[] = {
+ LdHi->getChain(), LdHi->getBasePtr(), TiedIn
+ };
unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
if (LdHi->getMemoryVT() == MVT::i8) {
- LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD
- ? AMDGPUISD::LOAD_D16_HI_I8
- : AMDGPUISD::LOAD_D16_HI_U8;
+ LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
} else {
assert(LdHi->getMemoryVT() == MVT::i16);
}
SDValue NewLoadHi =
- CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList, Ops,
- LdHi->getMemoryVT(), LdHi->getMemOperand());
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
+ Ops, LdHi->getMemoryVT(),
+ LdHi->getMemOperand());
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
@@ -317,20 +316,22 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
if (LdLo->getMemoryVT() == MVT::i8) {
- LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD
- ? AMDGPUISD::LOAD_D16_LO_I8
- : AMDGPUISD::LOAD_D16_LO_U8;
+ LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
+ AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
} else {
assert(LdLo->getMemoryVT() == MVT::i16);
}
TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
- SDValue Ops[] = {LdLo->getChain(), LdLo->getBasePtr(), TiedIn};
+ SDValue Ops[] = {
+ LdLo->getChain(), LdLo->getBasePtr(), TiedIn
+ };
SDValue NewLoadLo =
- CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList, Ops,
- LdLo->getMemoryVT(), LdLo->getMemOperand());
+ CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
+ Ops, LdLo->getMemoryVT(),
+ LdLo->getMemOperand());
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
@@ -341,29 +342,31 @@ bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
}
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
- bool MadeChange = false;
+ if (!Subtarget->d16PreservesUnusedBits())
+ return;
- if (Subtarget->d16PreservesUnusedBits()) {
- SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
- while (Position != CurDAG->allnodes_begin()) {
- SDNode *N = &*--Position;
- if (N->use_empty())
- continue;
+ SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
- switch (N->getOpcode()) {
- case ISD::BUILD_VECTOR:
- // TODO: Match load d16 from shl (extload:i16), 16
- MadeChange |= matchLoadD16FromBuildVector(N);
- break;
- default:
- break;
- }
+ bool MadeChange = false;
+ while (Position != CurDAG->allnodes_begin()) {
+ SDNode *N = &*--Position;
+ if (N->use_empty())
+ continue;
+
+ switch (N->getOpcode()) {
+ case ISD::BUILD_VECTOR:
+ // TODO: Match load d16 from shl (extload:i16), 16
+ MadeChange |= matchLoadD16FromBuildVector(N);
+ break;
+ default:
+ break;
}
}
if (MadeChange) {
CurDAG->RemoveDeadNodes();
- LLVM_DEBUG(dbgs() << "After PreProcess:\n"; CurDAG->dump(););
+ LLVM_DEBUG(dbgs() << "After PreProcess:\n";
+ CurDAG->dump(););
}
}
@@ -385,8 +388,8 @@ bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
/// \returns The register class of the virtual register that will be used for
/// the given operand number \OpNo or NULL if the register class cannot be
/// determined.
-const TargetRegisterClass *
-AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+ unsigned OpNo) const {
if (!N->isMachineOpcode()) {
if (N->getOpcode() == ISD::CopyToReg) {
Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
@@ -424,14 +427,14 @@ AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N, unsigned OpNo) const {
SDValue SubRegOp = N->getOperand(OpNo + 1);
unsigned SubRegIdx = SubRegOp->getAsZExtVal();
return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
- SubRegIdx);
+ SubRegIdx);
}
}
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
SDValue Glue) const {
- SmallVector<SDValue, 8> Ops;
+ SmallVector <SDValue, 8> Ops;
Ops.push_back(NewChain); // Replace the chain.
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));
@@ -441,8 +444,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
}
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
- const SITargetLowering &Lowering =
- *static_cast<const SITargetLowering *>(getTargetLowering());
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
@@ -459,8 +462,8 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
} else if (AS == AMDGPUAS::REGION_ADDRESS) {
MachineFunction &MF = CurDAG->getMachineFunction();
unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
- return glueCopyToM0(N,
- CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
+ return
+ glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
}
return N;
}
@@ -542,7 +545,7 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
}
assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
- "supported yet");
+ "supported yet");
// 32 = Max Num Vector Elements
// 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
// 1 = Vector Register Class
@@ -568,8 +571,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
if (NOps != NumVectorElts) {
// Fill in the missing undef elements if this was a scalar_to_vector.
assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
- MachineSDNode *ImpDef =
- CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, EltVT);
+ MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(
i * EltSizeInRegs, EltSizeInRegs)
@@ -678,7 +681,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
N->setNodeId(-1);
- return; // Already selected.
+ return; // Already selected.
}
// isa<MemSDNode> almost works but is slightly too permissive for some DS
@@ -766,8 +769,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
} else {
llvm_unreachable("Unhandled value type for BUILD_PAIR");
}
- const SDValue Ops[] = {RC, N->getOperand(0), SubReg0, N->getOperand(1),
- SubReg1};
+ const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+ N->getOperand(1), SubReg1 };
ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops));
return;
@@ -818,8 +821,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
uint32_t OffsetVal = Offset->getZExtValue();
uint32_t WidthVal = Width->getZExtValue();
- ReplaceNode(
- N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
+ ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
+ WidthVal));
return;
}
case AMDGPUISD::DIV_SCALE: {
@@ -835,8 +838,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::UMUL_LOHI:
return SelectMUL_LOHI(N);
case ISD::CopyToReg: {
- const SITargetLowering &Lowering =
- *static_cast<const SITargetLowering *>(getTargetLowering());
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
break;
}
@@ -864,7 +867,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (N->getValueType(0) == MVT::i32) {
MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
- {N->getOperand(0), N->getOperand(1)});
+ { N->getOperand(0), N->getOperand(1) });
SelectCode(N);
return;
}
@@ -1027,7 +1030,7 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
- (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+ (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
Base = Addr.getOperand(0);
Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
} else {
@@ -1040,9 +1043,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
const SDLoc &DL) const {
- SDNode *Mov =
- CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
- CurDAG->getTargetConstant(Val, DL, MVT::i32));
+ SDNode *Mov = CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Val, DL, MVT::i32));
return SDValue(Mov, 0);
}
@@ -1183,8 +1186,7 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
- // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
- // omod
+ // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
SDValue Ops[10];
SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
@@ -1222,8 +1224,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
assert(VT == MVT::f32 || VT == MVT::f64);
- unsigned Opc = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64
- : AMDGPU::V_DIV_SCALE_F32_e64;
+ unsigned Opc
+ = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
// omod
@@ -1250,7 +1252,8 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
- SDValue Ops[] = {N->getOperand(0), N->getOperand(1), N->getOperand(2), Clamp};
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ Clamp };
if (UseNoCarry) {
MachineSDNode *Mad = CurDAG->getMachineNode(Opc, SL, MVT::i64, Ops);
@@ -1329,8 +1332,7 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
}
} else if (Addr.getOpcode() == ISD::SUB) {
// sub C, x -> add (sub 0, x), C
- if (const ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
int64_t ByteOffset = C->getSExtValue();
if (isDSOffsetLegal(SDValue(), ByteOffset)) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1338,8 +1340,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
// XXX - This is kind of hacky. Create a dummy sub node so we can check
// the known bits in isDSOffsetLegal. We need to emit the selected node
// here, so this is thrown away.
- SDValue Sub =
- CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
+ SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
+ Zero, Addr.getOperand(1));
if (isDSOffsetLegal(Sub, ByteOffset)) {
SmallVector<SDValue, 3> Opnds;
@@ -1373,8 +1375,8 @@ bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
- MachineSDNode *MovZero =
- CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
+ MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ DL, MVT::i32, Zero);
Base = SDValue(MovZero, 0);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
@@ -1679,8 +1681,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
if (C->getSExtValue()) {
SDLoc DL(Addr);
- const SITargetLowering &Lowering =
- *static_cast<const SITargetLowering *>(getTargetLowering());
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
return true;
@@ -1689,8 +1691,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
}
-std::pair<SDValue, SDValue>
-AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
SDLoc DL(N);
auto *FI = dyn_cast<FrameIndexSDNode>(N);
@@ -1704,9 +1705,9 @@ AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
return std::pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
- SDValue &Rsrc, SDValue &VAddr,
- SDValue &SOffset,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
+ SDValue Addr, SDValue &Rsrc,
+ SDValue &VAddr, SDValue &SOffset,
SDValue &ImmOffset) const {
SDLoc DL(Addr);
@@ -1724,8 +1725,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, SDValue Addr,
const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
SDValue HighBits =
CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
- MachineSDNode *MovHighBits =
- CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1781,7 +1782,8 @@ static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
return RC && TRI.isSGPRClass(RC);
}
-bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
+ SDValue Addr,
SDValue &SRsrc,
SDValue &SOffset,
SDValue &Offset) const {
@@ -1824,8 +1826,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SDValue Addr,
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
- SDValue &SOffset,
- SDValue &Offset) const {
+ SDValue &SOffset, SDValue &Offset
+ ) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1839,8 +1841,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
maskTrailingOnes<uint64_t>(32); // Size
SDLoc DL(Addr);
- const SITargetLowering &Lowering =
- *static_cast<const SITargetLowering *>(getTargetLowering());
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
return true;
@@ -1861,14 +1863,14 @@ bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode *findMemSDNode(SDNode *N) {
- N = AMDGPUTargetLowering::stripBitcast(SDValue(N, 0)).getNode();
+static MemSDNode* findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
return MN;
assert(isa<BuildVectorSDNode>(N));
for (SDValue V : N->op_values())
if (MemSDNode *MN =
- dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
return MN;
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
@@ -2249,8 +2251,8 @@ static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
// Materialize this into a scalar move for scalar address to avoid
// readfirstlane.
auto *FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
- SDValue TFI =
- CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+ FI->getValueType(0));
SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
MVT::i32, TFI, SAddr.getOperand(1)),
0);
@@ -2350,8 +2352,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
if (isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
- AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
- CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
+ AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
+ CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
if (!isFlatScratchBaseLegal(Addr))
@@ -2539,16 +2541,16 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
const SDValue Ops[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
- Addr,
- CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
- 0),
- CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+ CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+ Addr,
+ CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+ SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+ 0),
+ CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
};
- return SDValue(
- CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, Ops), 0);
+ return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+ Ops), 0);
}
// Match a base and an immediate (if Offset is not null) or an SGPR (if
@@ -2684,7 +2686,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
-bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, SDValue &Base,
+bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
+ SDValue &Base,
SDValue &Offset) const {
SDLoc DL(Index);
@@ -2749,7 +2752,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
if (0 < BVal && BVal <= CVal && CVal < 32) {
bool Signed = N->getOpcode() == ISD::SRA;
ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
- 32 - CVal));
+ 32 - CVal));
return;
}
}
@@ -2794,7 +2797,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
if (isMask_32(MaskVal)) {
uint32_t WidthVal = llvm::popcount(MaskVal);
ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
- WidthVal));
+ WidthVal));
return;
}
}
@@ -2889,8 +2892,8 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);
if (Cond.isUndef()) {
- CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other, N->getOperand(2),
- N->getOperand(0));
+ CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
+ N->getOperand(2), N->getOperand(0));
return;
}
@@ -2990,8 +2993,8 @@ void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
- unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ? AMDGPU::DS_APPEND
- : AMDGPU::DS_CONSUME;
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+ AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
SDValue Chain = N->getOperand(0);
SDValue Ptr = N->getOperand(2);
@@ -3017,8 +3020,10 @@ void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
}
SDValue Ops[] = {
- Offset, CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32), Chain,
- N->getOperand(N->getNumOperands() - 1) // New glue
+ Offset,
+ CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+ Chain,
+ N->getOperand(N->getNumOperands() - 1) // New glue
};
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
@@ -3141,12 +3146,14 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
// Prefer to do the shift in an SGPR since it should be possible to use m0
// as the result directly. If it's already an SGPR, it will be eliminated
// later.
- SDNode *SGPROffset = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
- MVT::i32, BaseOffset);
+ SDNode *SGPROffset
+ = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+ BaseOffset);
// Shift to offset in m0
- SDNode *M0Base = CurDAG->getMachineNode(
- AMDGPU::S_LSHL_B32, SL, MVT::i32, SDValue(SGPROffset, 0),
- CurDAG->getTargetConstant(16, SL, MVT::i32));
+ SDNode *M0Base
+ = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+ SDValue(SGPROffset, 0),
+ CurDAG->getTargetConstant(16, SL, MVT::i32));
glueCopyToM0(N, SDValue(M0Base, 0));
}
@@ -3226,27 +3233,27 @@ void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
- SDNode *InterpMov = CurDAG->getMachineNode(
- AMDGPU::V_INTERP_MOV_F32, DL, VTs,
- {
- CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
- N->getOperand(3), // Attr
- N->getOperand(2), // Attrchan
- ToM0.getValue(1) // In glue
- });
-
- SDNode *InterpP1LV = CurDAG->getMachineNode(
- AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32,
- {CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- N->getOperand(1), // Src0
- N->getOperand(3), // Attr
- N->getOperand(2), // Attrchan
- CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
- SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
- N->getOperand(4), // high
- CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
- CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
- SDValue(InterpMov, 1)});
+ SDNode *InterpMov =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
+ CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ ToM0.getValue(1) // In glue
+ });
+
+ SDNode *InterpP1LV =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+ N->getOperand(1), // Src0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+ SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+ N->getOperand(4), // high
+ CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+ SDValue(InterpMov, 1)
+ });
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
@@ -3373,8 +3380,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
}
void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
- SDValue Log2WaveSize = CurDAG->getTargetConstant(
- Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
+ SDValue Log2WaveSize =
+ CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
{N->getOperand(0), Log2WaveSize});
}
@@ -3647,14 +3654,14 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
if (Lo.getValueSizeInBits() > VecSize) {
Lo = CurDAG->getTargetExtractSubreg(
- (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
- MVT::getIntegerVT(VecSize), Lo);
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Lo);
}
if (Hi.getValueSizeInBits() > VecSize) {
Hi = CurDAG->getTargetExtractSubreg(
- (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
- MVT::getIntegerVT(VecSize), Hi);
+ (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
+ MVT::getIntegerVT(VecSize), Hi);
}
assert(Lo.getValueSizeInBits() <= VecSize &&
@@ -3694,18 +3701,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
TRI->getSubRegFromChannel(NumRegs, NumRegs), SL, MVT::i32)};
Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
- Src.getValueType(), Ops),
- 0);
+ Src.getValueType(), Ops), 0);
}
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
- uint64_t Lit = cast<ConstantFPSDNode>(Lo)
- ->getValueAPF()
- .bitcastToAPInt()
- .getZExtValue();
+ uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
+ .bitcastToAPInt().getZExtValue();
if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
@@ -4387,7 +4391,7 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
// 1 0 1
// 1 1 0
// 1 1 1
- const uint8_t SrcBits[3] = {0xf0, 0xcc, 0xaa};
+ const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
if (C->isAllOnes()) {
@@ -4448,7 +4452,8 @@ static std::pair<unsigned, uint8_t> BitOp3_Op(SDValue In,
SDValue RHS = In.getOperand(1);
SmallVector<SDValue, 3> Backup(Src.begin(), Src.end());
- if (!getOperandBits(LHS, LHSBits) || !getOperandBits(RHS, RHSBits)) {
+ if (!getOperandBits(LHS, LHSBits) ||
+ !getOperandBits(RHS, RHSBits)) {
Src = std::move(Backup);
return std::make_pair(0, 0);
}
@@ -4642,7 +4647,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
SDLoc SL(In);
return CurDAG->getConstant(
- C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
+ C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
}
SDValue Src;
@@ -4652,7 +4657,7 @@ SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
return SDValue();
}
-bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
+bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
assert(CurDAG->getTarget().getTargetTriple().isAMDGCN());
const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
@@ -4661,7 +4666,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode *N) const {
unsigned Limit = 0;
bool AllUsesAcceptSReg = true;
for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
- Limit < 10 && U != E; ++U, ++Limit) {
+ Limit < 10 && U != E; ++U, ++Limit) {
const TargetRegisterClass *RC =
getOperandRegClass(U->getUser(), U->getOperandNo());
@@ -4731,8 +4736,8 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
}
void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
- const AMDGPUTargetLowering &Lowering =
- *static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
+ const AMDGPUTargetLowering& Lowering =
+ *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
bool IsModified = false;
do {
IsModified = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a06c15594bf0a..95f85a6151375 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -273,7 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &SrcMods) const;
bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2,
- SDValue &Tbl) const;
+ SDValue &Tbl) const;
SDValue getHi16Elt(SDValue In) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
new file mode 100644
index 0000000000000..5e4a0914a4366
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleVGPRs.cpp
@@ -0,0 +1,254 @@
+//===- AMDGPULowerModuleVGPRs.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lays out the "VGPR as memory" (addrspace(13)) globals of a module into a
+// single shared register "file" and records, on every function that takes part
+// in a call graph which uses the file, where that file lives.
+//
+// Unlike ordinary memory, the file is backed by a fixed block of physical
+// VGPRs. For an address into the file to be meaningful across a call (i.e. for
+// it to be passed between functions), every function in the call graph must
+// agree on (a) the byte offset of each global within the file and (b) the
+// physical register the file starts at. Computing these per function (as the
+// backend does on its own) does not satisfy (b), because the natural low base
+// sits just above each function's ABI input registers, which differ. This pass
+// resolves both module-wide:
+//
+// * Offsets: all addrspace(13) globals are packed into one layout, in a
+// deterministic order, and each global's byte offset is recorded as
+// "amdgpu.vgpr.memory.offset" metadata.
+//
+// * Base: a single base register index, chosen as the maximum ABI-input VGPR
+// boundary over all participating functions, so it is at or above every
+// function's own inputs and is therefore identical everywhere. It is kept as
+// low as that maximum allows so occupancy is preserved.
+//
+// Both the total file size and the shared base are attached as the
+// "amdgpu-vgpr-memory-size" and "amdgpu-vgpr-memory-base" function attributes
+// to every function whose call graph uses the file (the file behaves like LDS:
+// it is live for a using kernel's entire execution, so all reachable functions
+// must reserve it). The backend consumes these:
+// - SIISelLowering reads the per-global offset metadata.
+// - SIMachineFunctionInfo reads the size/base attributes.
+// - SIRegisterInfo::getVGPRMemoryFile reserves [base, base + size).
+//
+// TODO: Groups are connected components, not kernels: every function in a
+// group reserves all of that group's addrspace(13) globals, kernels sharing a
+// callee are merged, and a function reachable from several kernels reserves
+// the file even when called from a kernel that does not use it. A per-kernel
+// layout (as AMDGPULowerModuleLDS does, with a table for shared callees) would
+// tighten this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-module-vgprs"
+
+namespace {
+
+// Names this pass writes into the IR: two string function attributes (the
+// byte size of the laid-out file and the base register index computed from
+// inputVGPRBound) and the metadata kind that carries a global's byte offset.
+constexpr char SizeAttr[] = "amdgpu-vgpr-memory-size";
+constexpr char BaseAttr[] = "amdgpu-vgpr-memory-base";
+constexpr char OffsetMD[] = "amdgpu.vgpr.memory.offset";
+
+// Upper bound on the number of VGPRs occupied by a function's ABI inputs (the
+// registers the shared file must sit above).
+//
+// Calling conventions accepted by AMDGPU::isKernel() yield 0 or 1; every other
+// function yields the summed dword count of its non-inreg arguments.
+static unsigned inputVGPRBound(const Function &F) {
+ // Compute kernels take their arguments in the kernarg segment (SGPRs/memory),
+ // not VGPRs. Their only VGPR input is the workitem ID, which AMDGPU packs
+ // into a single register (v0), so it occupies at most one VGPR regardless of
+ // how many dimensions are used.
+ // NOTE(review): the single-register bound assumes the subtarget packs all
+ // three workitem IDs into v0; confirm that this holds on every subtarget that
+ // accepts addrspace(13), otherwise the bound is too low for y/z.
+ if (AMDGPU::isKernel(F.getCallingConv())) {
+ // The ID counts as used unless all three "amdgpu-no-workitem-id-*"
+ // attributes are present.
+ bool UsesWorkitemID = !F.hasFnAttribute("amdgpu-no-workitem-id-x") ||
+ !F.hasFnAttribute("amdgpu-no-workitem-id-y") ||
+ !F.hasFnAttribute("amdgpu-no-workitem-id-z");
+ return UsesWorkitemID ? 1 : 0;
+ }
+
+ // Graphics entry points and ordinary functions pass their arguments in VGPRs
+ // (except inreg arguments, which go in SGPRs).
+ // Each counted argument is rounded up to whole dwords from its alloc size.
+ // NOTE(review): byval/sret and stack-passed arguments are counted as well,
+ // which presumably only over-estimates the bound -- confirm.
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned N = 0;
+ for (const Argument &A : F.args()) {
+ if (A.hasAttribute(Attribute::InReg))
+ continue;
+ N += divideCeil(DL.getTypeAllocSize(A.getType()).getFixedValue(), 4u);
+ }
+ return N;
+}
+
+// ModulePass wrapper around the module lowering; runOnModule() is defined
+// out of line further down in this file.
+class AMDGPULowerModuleVGPRs : public ModulePass {
+public:
+ static char ID;
+ AMDGPULowerModuleVGPRs() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override { return "AMDGPU Lower Module VGPRs"; }
+};
+
+} // end anonymous namespace
+
+// Storage for the pass ID and the reference to it exported from namespace
+// llvm.
+char AMDGPULowerModuleVGPRs::ID = 0;
+char &llvm::AMDGPULowerModuleVGPRsID = AMDGPULowerModuleVGPRs::ID;
+
+// Registers the pass under DEBUG_TYPE ("amdgpu-lower-module-vgprs").
+INITIALIZE_PASS(AMDGPULowerModuleVGPRs, DEBUG_TYPE, "AMDGPU Lower Module VGPRs",
+ false, false)
+
+// Returns a newly heap-allocated instance of the ModulePass wrapper.
+ModulePass *llvm::createAMDGPULowerModuleVGPRsPass() {
+ return new AMDGPULowerModuleVGPRs();
+}
+
+/// Lays out the addrspace(13) globals of \p M and annotates the functions that
+/// must reserve the resulting VGPR file.
+///
+/// Functions and globals are partitioned into groups: a global is grouped with
+/// every defined function that references it, and every defined function
+/// reachable from a file-using entry function (through call graph edges that
+/// name their callee) is grouped with that entry function. For each group
+/// holding at least one global and one function, the globals are packed in
+/// name order and tagged with "amdgpu.vgpr.memory.offset" metadata, and every
+/// function receives the "amdgpu-vgpr-memory-size" and
+/// "amdgpu-vgpr-memory-base" attributes.
+///
+/// \returns true if the module was modified, false if it contains no
+/// addrspace(13) global or no defined function references one.
+static bool lowerModuleVGPRs(Module &M) {
+  // Nothing to do unless the module contains an addrspace(13) global.
+  if (llvm::none_of(M.globals(), [](const GlobalVariable &GV) {
+        return GV.getAddressSpace() == AMDGPUAS::VGPR;
+      }))
+    return false;
+
+  // Map each function to the addrspace(13) globals it directly references.
+  DenseMap<Function *, SmallVector<GlobalVariable *, 2>> Uses;
+  for (Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    SmallPtrSet<GlobalVariable *, 4> Seen;
+    for (Instruction &I : instructions(F))
+      for (Value *Op : I.operands()) {
+        // getUnderlyingObject sees through (constant-expression) GEPs and
+        // casts, so a global referenced via e.g. `getelementptr(@g, off)` is
+        // found.
+        auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Op));
+        if (GV && GV->getAddressSpace() == AMDGPUAS::VGPR &&
+            Seen.insert(GV).second)
+          Uses[&F].push_back(GV);
+      }
+  }
+  // Nothing references the file. No IR has been changed up to this point, so
+  // report the module as unmodified.
+  if (Uses.empty())
+    return false;
+
+  CallGraph CG(M);
+  // Appends Root and every defined function reachable from it to Out, in
+  // discovery order. Call graph edges without a callee function are skipped.
+  auto Reachable = [&](Function *Root, SmallVectorImpl<Function *> &Out) {
+    SmallPtrSet<Function *, 16> Visited;
+    SmallVector<Function *, 16> Work{Root};
+    while (!Work.empty()) {
+      Function *F = Work.pop_back_val();
+      if (!Visited.insert(F).second)
+        continue;
+      Out.push_back(F);
+      if (CallGraphNode *N = CG[F])
+        for (auto &CR : *N)
+          if (Function *Callee = CR.second->getFunction())
+            if (!Callee->isDeclaration())
+              Work.push_back(Callee);
+    }
+  };
+
+  // Partition functions and globals into independent layout groups. The file is
+  // live for a using kernel's whole execution (like LDS), so a group must cover
+  // everything reachable from such a kernel; and a global must share a layout
+  // with every function that uses it. Disjoint kernels therefore land in
+  // separate groups and get independent (low, occupancy-friendly) bases, while
+  // data- or call-graph-shared functions stay in one consistent group.
+  //
+  // Functions and globals are both GlobalValues, so one union-find covers both.
+  EquivalenceClasses<const GlobalValue *> Groups;
+  // Walk the module's function list instead of the DenseMap so that the order
+  // of the unions does not depend on DenseMap iteration order (pointer values).
+  for (Function &F : M) {
+    auto UseIt = Uses.find(&F);
+    if (UseIt == Uses.end())
+      continue;
+    for (GlobalVariable *GV : UseIt->second)
+      Groups.unionSets(&F, GV);
+  }
+
+  // Functions reachable from each file-using kernel join that kernel's group
+  // (so they reserve the file), and kernels sharing any callee merge.
+  for (Function &K : M) {
+    if (K.isDeclaration() || !AMDGPU::isEntryFunctionCC(K.getCallingConv()))
+      continue;
+    SmallVector<Function *, 16> R;
+    Reachable(&K, R);
+    if (llvm::none_of(R, [&](Function *F) { return Uses.count(F); }))
+      continue; // this kernel does not use the file
+    for (Function *F : R)
+      Groups.unionSets(&K, F);
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  LLVMContext &Ctx = M.getContext();
+  Type *I32 = Type::getInt32Ty(Ctx);
+
+  // Lay out each group independently.
+  for (auto It = Groups.begin(), E = Groups.end(); It != E; ++It) {
+    const auto *Leader = *It;
+    if (!Leader->isLeader())
+      continue;
+    SmallVector<GlobalVariable *, 8> GroupGlobals;
+    SmallVector<Function *, 16> GroupFns;
+    for (auto MI = Groups.member_begin(*Leader); MI != Groups.member_end();
+         ++MI) {
+      const GlobalValue *GV = *MI;
+      if (auto *G = dyn_cast<GlobalVariable>(GV))
+        GroupGlobals.push_back(const_cast<GlobalVariable *>(G));
+      else
+        GroupFns.push_back(const_cast<Function *>(cast<Function>(GV)));
+    }
+    if (GroupGlobals.empty() || GroupFns.empty())
+      continue;
+
+    // Deterministic packed layout (sorted by name). The sort is stable, so
+    // globals whose names compare equal keep their group member order.
+    llvm::stable_sort(GroupGlobals, [](GlobalVariable *A, GlobalVariable *B) {
+      return A->getName() < B->getName();
+    });
+    unsigned Size = 0;
+    for (GlobalVariable *GV : GroupGlobals) {
+      // Every global starts on at least a dword boundary.
+      Align A = std::max(
+          DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType()),
+          Align(4));
+      unsigned Offset = alignTo(Size, A);
+      GV->setMetadata(OffsetMD,
+                      MDNode::get(Ctx, {ConstantAsMetadata::get(
+                                           ConstantInt::get(I32, Offset))}));
+      Size = Offset + DL.getTypeAllocSize(GV->getValueType()).getFixedValue();
+    }
+
+    // One base for the group: above every member's ABI inputs, even-aligned.
+    unsigned Base = 0;
+    for (Function *F : GroupFns)
+      Base = std::max(Base, inputVGPRBound(*F));
+    Base = alignTo(Base, 2u);
+
+    for (Function *F : GroupFns) {
+      F->addFnAttr(SizeAttr, utostr(Size));
+      F->addFnAttr(BaseAttr, utostr(Base));
+    }
+  }
+  return true;
+}
+
+// ModulePass entry point: forwards to lowerModuleVGPRs() and returns its
+// result unchanged.
+bool AMDGPULowerModuleVGPRs::runOnModule(Module &M) {
+ return lowerModuleVGPRs(M);
+}
+
+// Analysis-manager entry point: runs lowerModuleVGPRs() and reports no
+// analyses preserved when it returns true, all analyses preserved otherwise.
+// The ModuleAnalysisManager argument is unused.
+PreservedAnalyses AMDGPULowerModuleVGPRsPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ return lowerModuleVGPRs(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index b377704c2f296..028406085fd7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -23,6 +23,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
MODULE_PASS("amdgpu-lower-intrinsics", AMDGPULowerIntrinsicsPass(*this))
MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-lower-module-vgprs", AMDGPULowerModuleVGPRsPass())
MODULE_PASS("amdgpu-perf-hint",
AMDGPUPerfHintAnalysisPass(
*static_cast<const GCNTargetMachine *>(this)))
@@ -67,7 +68,6 @@ FUNCTION_PASS("amdgpu-lower-kernel-attributes",
FUNCTION_PASS("amdgpu-promote-alloca", AMDGPUPromoteAllocaPass(*this))
FUNCTION_PASS("amdgpu-promote-alloca-to-vector",
AMDGPUPromoteAllocaToVectorPass(*this))
-FUNCTION_PASS("amdgpu-vgpr-allocate", AMDGPUVGPRAllocatePass(*this))
FUNCTION_PASS("amdgpu-promote-kernel-arguments",
AMDGPUPromoteKernelArgumentsPass())
FUNCTION_PASS("amdgpu-rewrite-undef-for-phi", AMDGPURewriteUndefForPHIPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
index d8ff923619193..7dbc3e4f79690 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrivateObjectVGPRs.cpp
@@ -8,29 +8,33 @@
//
/// \file
/// Lowers the SI_VGPR_FRAME_{LOAD,STORE} pseudos produced for "VGPR as memory"
-/// objects (allocas in AMDGPUAS::VGPR) into register copies into/out of a
-/// virtual VGPR tuple that backs the per-function VGPR file. Each pseudo
-/// carries a constant byte offset, which selects the dword (subregister) to
-/// copy.
+/// objects (AMDGPUAS::VGPR / addrspace(13)) into register copies to/from a
+/// block of physical VGPRs that backs the per-function VGPR file. Each pseudo
+/// carries a constant dword index into the file.
///
-/// This runs once the function is out of SSA form (so the single backing tuple
-/// can be defined by several subregister copies) and while LiveIntervals is
-/// available. The backing tuple has lane-divergent liveness (its subregisters
-/// are written and read independently), which the whole-register LiveVariables
-/// analysis cannot represent; the pass therefore updates the subregister-aware
-/// LiveIntervals directly.
+/// A load is simply a COPY from the file register and a store is a COPY to it;
+/// the storage is persistent for the whole function like LDS. The file occupies
+/// a fixed block of physical VGPRs (SIRegisterInfo::getVGPRMemoryFile) that is
+/// reserved out of allocation (SIRegisterInfo::getReservedRegs) and accounted
+/// for in the VGPR count (AMDGPUResourceUsageAnalysis). It sits at the low end
+/// of the VGPR file, just above the ABI inputs, at a base that
+/// AMDGPULowerModuleVGPRs shares across a call graph (so an address resolves to
+/// the same registers in every function), keeping it low enough that the file
+/// costs only its own size rather than pinning occupancy. This pass runs after
+/// register allocation; until then the access pseudos behave as opaque memory
+/// operations, so register allocation is free to use any other register for the
+/// surrounding code.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
using namespace llvm;
@@ -96,9 +100,6 @@ class AMDGPUPrivateObjectVGPRs : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<LiveIntervalsWrapperPass>();
- AU.addPreserved<LiveIntervalsWrapperPass>();
- AU.addPreserved<SlotIndexesWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -118,80 +119,49 @@ bool AMDGPUPrivateObjectVGPRs::runOnMachineFunction(MachineFunction &MF) {
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- // Collect the pseudos and determine how many dwords the backing tuple needs.
- // Each pseudo carries a constant dword index and accesses as many dwords as
- // its data register class is wide.
- SmallVector<MachineInstr *, 8> Worklist;
- unsigned NumDwords = 0;
+ // The file occupies a fixed block of physical VGPRs (see
+ // SIRegisterInfo::getVGPRMemoryFile), which is already reserved out of
+ // allocation by getReservedRegs. Because the registers are reserved, they are
+ // exempt from liveness checks and need no explicit definition, and because
+ // the location is fixed (not function-local), references in different
+ // functions of a call graph resolve to the same physical registers.
+ auto [BaseIdx, FileDwords] = TRI->getVGPRMemoryFile(MF);
+ if (FileDwords == 0)
+ return false;
+
+ const TargetRegisterClass &VGPR32 = AMDGPU::VGPR_32RegClass;
+ bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
+ for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
unsigned Opc = MI.getOpcode();
- if (!isVGPRFrameLoad(Opc) && !isVGPRFrameStore(Opc))
+ bool IsLoad = isVGPRFrameLoad(Opc);
+ if (!IsLoad && !isVGPRFrameStore(Opc))
continue;
- unsigned Dword = MI.getOperand(1).getImm();
- unsigned AccessDwords =
- TRI->getRegSizeInBits(*MRI.getRegClass(MI.getOperand(0).getReg())) /
- 32;
- NumDwords = std::max(NumDwords, Dword + AccessDwords);
- Worklist.push_back(&MI);
- }
- }
- if (Worklist.empty())
- return false;
-
- LiveIntervals *LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
-
- const TargetRegisterClass *RC = TRI->getVGPRClassForBitWidth(NumDwords * 32);
- assert(RC && "no VGPR register class for VGPR-as-memory object");
- Register Storage = MRI.createVirtualRegister(RC);
-
- // Define the whole tuple up front so partial (subregister) writes and reads
- // of uninitialized lanes are well formed.
- MachineBasicBlock &Entry = MF.front();
- MachineInstr *ImpDef = BuildMI(Entry, Entry.begin(), DebugLoc(),
- TII->get(TargetOpcode::IMPLICIT_DEF), Storage);
- LIS->InsertMachineInstrInMaps(*ImpDef);
-
- for (MachineInstr *MI : Worklist) {
- MachineBasicBlock &MBB = *MI->getParent();
- const DebugLoc &DL = MI->getDebugLoc();
- unsigned Dword = MI->getOperand(1).getImm();
- unsigned AccessDwords =
- TRI->getRegSizeInBits(*MRI.getRegClass(MI->getOperand(0).getReg())) /
- 32;
- unsigned SubReg =
- (Dword == 0 && AccessDwords == NumDwords)
- ? AMDGPU::NoSubRegister
- : SIRegisterInfo::getSubRegFromChannel(Dword, AccessDwords);
-
- MachineInstr *Copy;
- if (isVGPRFrameLoad(MI->getOpcode())) {
- Register Dst = MI->getOperand(0).getReg();
- Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY), Dst)
- .addReg(Storage, {}, SubReg);
- } else {
- Register Src = MI->getOperand(0).getReg();
- Copy = BuildMI(MBB, *MI, DL, TII->get(TargetOpcode::COPY))
- .addReg(Storage, RegState::Define, SubReg)
- .addReg(Src);
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Dword = MI.getOperand(1).getImm();
+ Register Data = MI.getOperand(0).getReg();
+ unsigned AccessDwords = TRI->getRegSizeInBits(Data, MRI) / 32;
+
+ // Physical (sub)register backing this access within the file.
+ MCRegister Phys = VGPR32.getRegister(BaseIdx + Dword);
+ if (AccessDwords != 1) {
+ const TargetRegisterClass *RC =
+ TRI->getVGPRClassForBitWidth(AccessDwords * 32);
+ Phys = TRI->getMatchingSuperReg(Phys, AMDGPU::sub0, RC);
+ assert(Phys &&
+ "no aligned physical VGPR tuple for VGPR-as-memory access");
+ }
+
+ if (IsLoad)
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), Data).addReg(Phys);
+ else
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::COPY), Phys).addReg(Data);
+
+ MI.eraseFromParent();
+ Changed = true;
}
- // The copy takes the pseudo's slot, so the intervals of the copied
- // load/store operand stay valid.
- LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
- MI->eraseFromParent();
}
- // The backing tuple is brand new; compute its (subregister) live interval.
- LiveInterval &LI = LIS->createAndComputeVirtRegInterval(Storage);
-
- // Independent dwords (and the entry IMPLICIT_DEF for never-written lanes)
- // form disconnected value-number components within the single tuple, which an
- // individual live interval must not contain. Split them into separate
- // virtual registers, exactly as the register coalescer does for the intervals
- // it leaves behind.
- SmallVector<LiveInterval *, 4> SplitLIs;
- LIS->splitSeparateComponents(LI, SplitLIs);
-
- return true;
+ return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 478f54b7cdfc3..95e06dc8295d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -35,7 +35,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
@@ -139,7 +138,6 @@ class AMDGPUPromoteAllocaImpl {
unsigned MaxVGPRs;
unsigned VGPRBudgetRatio;
unsigned MaxVectorRegs;
- unsigned AllocVGPROffset = 0;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -164,10 +162,6 @@ class AMDGPUPromoteAllocaImpl {
void analyzePromoteToVector(AllocaAnalysis &AA) const;
void promoteAllocaToVector(AllocaAnalysis &AA);
void analyzePromoteToLDS(AllocaAnalysis &AA) const;
-
- /// Allocate an alloca that already lives in the VGPR address space to a range
- /// of VGPRs, recording the allocation in !amdgpu.allocated.vgprs metadata.
- void allocateVgprs(AllocaAnalysis &AA);
bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
SetVector<IntrinsicInst *> &DeferredIntrs);
void
@@ -185,11 +179,7 @@ class AMDGPUPromoteAllocaImpl {
IsAMDHSA = TT.getOS() == Triple::AMDHSA;
}
- /// IsLatePass is true when invoked as a codegen pass and false when invoked
- /// from the optimization pipeline ("amdgpu-promote-alloca-to-vector"). NoOpt
- /// requests only the work strictly required for functionality (i.e. VGPR
- /// allocation), skipping the optimization-oriented promotions.
- bool run(Function &F, bool IsLatePass, bool NoOpt);
+ bool run(Function &F, bool PromoteToLDS);
};
// FIXME: This can create globals so should be a module pass.
@@ -197,34 +187,26 @@ class AMDGPUPromoteAlloca : public FunctionPass {
public:
static char ID;
- explicit AMDGPUPromoteAlloca(
- CodeGenOptLevel OptLevel = CodeGenOptLevel::Default)
- : FunctionPass(ID), NoOpt(OptLevel == CodeGenOptLevel::None) {}
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override {
if (skipFunction(F))
return false;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
return AMDGPUPromoteAllocaImpl(
TPC->getTM<TargetMachine>(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
- .run(F, /*IsLatePass=*/true, NoOpt);
- }
+ .run(F, /*PromoteToLDS*/ true);
return false;
}
- StringRef getPassName() const override {
- return NoOpt ? "AMDGPU VGPR Allocate" : "AMDGPU Promote Alloca";
- }
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
-
-private:
- bool NoOpt;
};
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
@@ -271,8 +253,7 @@ char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
- /*NoOpt=*/false);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -284,8 +265,7 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/false,
- /*NoOpt=*/false);
+ bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -294,21 +274,8 @@ AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
return PreservedAnalyses::all();
}
-PreservedAnalyses AMDGPUVGPRAllocatePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*IsLatePass=*/true,
- /*NoOpt=*/true);
- if (Changed) {
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
- }
- return PreservedAnalyses::all();
-}
-
-FunctionPass *llvm::createAMDGPUPromoteAlloca(CodeGenOptLevel OptLevel) {
- return new AMDGPUPromoteAlloca(OptLevel);
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
}
bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
@@ -401,110 +368,9 @@ void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}
-// A "VGPR as memory" object can only be realized in registers today when every
-// access is a constant-offset, dword-aligned, whole-dword-multiple (32, 64, ...
-// bit) load/store and its address never escapes. Sub-dword accesses, dynamic
-// indexing and escaping addresses are not yet supported; such objects fall back
-// to scratch instead.
-//
-// TODO: Lower dynamically-indexed / escaping VGPR objects so this fallback is no
-// longer needed.
-static bool isVGPRAllocaStaticallyLowerable(const AllocaInst &AI,
- const DataLayout &DL) {
- // An access is lowerable if it covers a whole number of dwords and starts at
- // a dword-aligned constant offset from the alloca.
- auto AccessOK = [&](const Value *Ptr, Type *Ty, bool Simple) {
- if (!Simple)
- return false;
- uint64_t Bits = DL.getTypeStoreSizeInBits(Ty);
- if (Bits == 0 || Bits % 32 != 0)
- return false;
- APInt Off(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
- const Value *Base = Ptr->stripAndAccumulateConstantOffsets(
- DL, Off, /*AllowNonInbounds=*/true);
- return Base == &AI && Off.urem(4) == 0;
- };
-
- SmallVector<const Use *, 16> Worklist;
- for (const Use &U : AI.uses())
- Worklist.push_back(&U);
-
- while (!Worklist.empty()) {
- const Use *U = Worklist.pop_back_val();
- const User *Usr = U->getUser();
-
- if (const auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
- if (!GEP->hasAllConstantIndices())
- return false;
- for (const Use &GU : GEP->uses())
- Worklist.push_back(&GU);
- continue;
- }
- if (const auto *LI = dyn_cast<LoadInst>(Usr)) {
- if (!AccessOK(LI->getPointerOperand(), LI->getType(), LI->isSimple()))
- return false;
- continue;
- }
- if (const auto *SI = dyn_cast<StoreInst>(Usr)) {
- // The pointer must be the address operand, not a stored value (escape).
- if (U->getOperandNo() != StoreInst::getPointerOperandIndex())
- return false;
- if (!AccessOK(SI->getPointerOperand(), SI->getValueOperand()->getType(),
- SI->isSimple()))
- return false;
- continue;
- }
- // Anything else (calls, ptrtoint, address-space casts, ...) escapes or is
- // otherwise not statically lowerable.
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
+ if (DisablePromoteAllocaToLDS && DisablePromoteAllocaToVector)
return false;
- }
- return true;
-}
-
-// Repoint every (transitive) pointer use of \p Old (an addrspace(13) value) at
-// \p New (an addrspace(5) value), so a non-lowerable "VGPR as memory" object
-// falls back to ordinary scratch.
-static void rewriteVGPRPointerToScratch(Value *Old, Value *New) {
- SmallVector<Use *, 16> Uses(make_pointer_range(Old->uses()));
- for (Use *U : Uses) {
- User *Usr = U->getUser();
- if (auto *GEP = dyn_cast<GetElementPtrInst>(Usr)) {
- IRBuilder<> B(GEP);
- SmallVector<Value *, 4> Indices(GEP->indices());
- Value *NewGEP = B.CreateGEP(GEP->getSourceElementType(), New, Indices,
- GEP->getName(), GEP->getNoWrapFlags());
- rewriteVGPRPointerToScratch(GEP, NewGEP);
- GEP->eraseFromParent();
- continue;
- }
- if (auto *II = dyn_cast<IntrinsicInst>(Usr);
- II && II->isLifetimeStartOrEnd()) {
- II->eraseFromParent();
- continue;
- }
- // Loads, stores, address-space casts and call arguments only need this
- // operand repointed; their result types do not depend on the operand's
- // address space.
- U->set(New);
- }
-}
-
-static void demoteVGPRAllocaToScratch(AllocaInst *AI) {
- auto *NewAI = new AllocaInst(
- AI->getAllocatedType(), AMDGPUAS::PRIVATE_ADDRESS, AI->getArraySize(),
- AI->getAlign(), AI->getName(), AI->getIterator());
- NewAI->setDebugLoc(AI->getDebugLoc());
- rewriteVGPRPointerToScratch(AI, NewAI);
- AI->eraseFromParent();
-}
-
-bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
- assert((!NoOpt || IsLatePass) && "NoOpt only makes sense for the late pass");
- if (!IsLatePass && DisablePromoteAllocaToVector)
- return false;
-
- bool PromoteToLDS = IsLatePass && !DisablePromoteAllocaToLDS && !NoOpt;
- bool PromoteToVector = !DisablePromoteAllocaToVector && !NoOpt;
Mod = F.getParent();
DL = &Mod->getDataLayout();
@@ -513,12 +379,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
setFunctionLimits(F);
- // "VGPR as memory" is only enabled on the gfx940/gfx950 (CDNA3+) parts and on
- // GFX12 and later. On any other target the objects fall back to scratch.
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- const bool TargetSupportsVGPRAsMemory =
- ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX12;
-
unsigned VectorizationBudget =
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32)) /
@@ -535,18 +395,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
AllocaAnalysis AA{AI};
- if (AI->getAddressSpace() == AMDGPUAS::VGPR) {
- // Allocas that already live in the VGPR address space only need to be
- // assigned VGPRs, which is required for functionality.
- if (IsLatePass)
- Allocas.push_back(std::move(AA));
- continue;
- }
- if (!PromoteToVector && !PromoteToLDS)
- continue;
if (collectAllocaUses(AA)) {
- if (PromoteToVector)
- analyzePromoteToVector(AA);
+ analyzePromoteToVector(AA);
if (PromoteToLDS)
analyzePromoteToLDS(AA);
if (AA.Vector.Ty || AA.LDS.Enable) {
@@ -557,15 +407,8 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
}
}
- stable_sort(Allocas, [](const auto &A, const auto &B) {
- // Prioritize pre-existing VGPR allocas, since their allocation must not
- // fail.
- bool AIsVGPR = A.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
- bool BIsVGPR = B.Alloca->getAddressSpace() == AMDGPUAS::VGPR;
- if (AIsVGPR != BIsVGPR)
- return AIsVGPR;
- return A.Score > B.Score;
- });
+ stable_sort(Allocas,
+ [](const auto &A, const auto &B) { return A.Score > B.Score; });
// clang-format off
LLVM_DEBUG(
@@ -578,39 +421,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
bool Changed = false;
SetVector<IntrinsicInst *> DeferredIntrs;
for (AllocaAnalysis &AA : Allocas) {
- if (AA.Alloca->getAddressSpace() == AMDGPUAS::VGPR) {
- // Fall back to scratch (and warn) when the object can't be kept in
- // registers, so the program still compiles correctly: either the target
- // does not support "VGPR as memory", or the access pattern (dynamic
- // index, sub-dword, escaping address) is not yet supported.
- const char *Unsupported = nullptr;
- if (!TargetSupportsVGPRAsMemory)
- Unsupported = "not supported on this target";
- else if (!isVGPRAllocaStaticallyLowerable(*AA.Alloca, *DL))
- Unsupported = "dynamic indexing, sub-dword access, or escaping address "
- "is not yet supported";
- if (Unsupported) {
- F.getContext().diagnose(DiagnosticInfoUnsupported(
- F,
- Twine("'amdgpu_vgpr' object could not be kept in vector registers "
- "(") +
- Unsupported + "); using scratch memory instead",
- AA.Alloca->getDebugLoc(), DS_Warning));
- demoteVGPRAllocaToScratch(AA.Alloca);
- Changed = true;
- continue;
- }
- const unsigned AllocaCost =
- AA.Alloca->getAllocationSize(*DL)->getFixedValue() * 8;
- allocateVgprs(AA);
- // Account for the consumed VGPRs in the vectorization budget.
- if (VectorizationBudget > AllocaCost)
- VectorizationBudget -= AllocaCost;
- else
- VectorizationBudget = 0;
- Changed = true;
- continue;
- }
if (AA.Vector.Ty) {
std::optional<TypeSize> Size = AA.Alloca->getAllocationSize(*DL);
assert(Size); // Expected to succeed on non-array alloca.
@@ -645,21 +455,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool IsLatePass, bool NoOpt) {
return Changed;
}
-void AMDGPUPromoteAllocaImpl::allocateVgprs(AllocaAnalysis &AA) {
- LLVMContext &Ctx = Mod->getContext();
- const unsigned AllocaSize =
- DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()) / 8;
-
- // Record where the object was allocated within the VGPR file.
- Type *I32 = Type::getInt32Ty(Ctx);
- AA.Alloca->setMetadata(
- "amdgpu.allocated.vgprs",
- MDNode::get(
- Ctx, {ConstantAsMetadata::get(ConstantInt::get(I32, AllocVGPROffset)),
- ConstantAsMetadata::get(ConstantInt::get(I32, AllocaSize))}));
- AllocVGPROffset += alignTo(AllocaSize, 4);
-}
-
// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index aab43f23cf606..fc97c33a123f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -179,9 +179,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
// If there are no calls, MachineRegisterInfo can tell us the used register
// count easily.
// A tail call isn't considered a call for MachineFrameInfo's purposes.
+ // The "VGPR as memory" file occupies reserved physical VGPRs. They are not
+ // counted as "used" registers, but they must still be allocated for the
+ // function, so the VGPR count has to cover the highest one.
+ std::pair<unsigned, unsigned> VGPRMemFile = TRI.getVGPRMemoryFile(MF);
+ unsigned VGPRMemBase = VGPRMemFile.first;
+ unsigned VGPRMemCount = VGPRMemFile.second;
+ auto AccountForVGPRMemoryFile = [&](int32_t NumVGPR) -> int32_t {
+ if (VGPRMemCount)
+ NumVGPR = std::max<int32_t>(NumVGPR, VGPRMemBase + VGPRMemCount);
+ return NumVGPR;
+ };
+
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass,
/*IncludeCalls=*/false);
+ Info.NumVGPR = AccountForVGPRMemoryFile(Info.NumVGPR);
return Info;
}
@@ -319,7 +332,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage(
}
}
- Info.NumVGPR = MaxVGPR + 1;
+ Info.NumVGPR = AccountForVGPRMemoryFile(MaxVGPR + 1);
return Info;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5814862a514b9..6a2b8ffa25c50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -669,6 +669,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
initializeAMDGPUPrivateObjectVGPRsPass(*PR);
+ initializeAMDGPULowerModuleVGPRsPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
@@ -1492,6 +1493,11 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}
+ // Lay out "VGPR as memory" (addrspace(13)) globals into one shared register
+ // file and record the size/base on the participating functions, so the file
+ // resolves to the same physical registers across a kernel's call graph.
+ addPass(createAMDGPULowerModuleVGPRsPass());
+
// Run atomic optimizer before Atomic Expand
if ((TM.getTargetTriple().isAMDGCN()) &&
(TM.getOptLevel() >= CodeGenOptLevel::Less) &&
@@ -1501,12 +1507,9 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAtomicExpandLegacyPass());
- // With optimizations enabled, do the full promotion of allocas. Without
- // optimizations, this only allocates pre-existing VGPR address space allocas,
- // which is required for functionality.
- addPass(createAMDGPUPromoteAlloca(TM.getOptLevel()));
-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ addPass(createAMDGPUPromoteAlloca());
+
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
@@ -1721,11 +1724,6 @@ void GCNPassConfig::addFastRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
- // Lower "VGPR as memory" accesses to register copies once out of SSA form.
- // At O0 there is no register coalescer; anchor on TwoAddress, where
- // LiveIntervals is already available.
- insertPass(&TwoAddressInstructionPassID, &AMDGPUPrivateObjectVGPRsID);
-
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
TargetPassConfig::addFastRegAlloc();
@@ -1752,12 +1750,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
- // Lower "VGPR as memory" accesses to register copies once out of SSA form.
- // This runs after the coalescer so it does not perturb the kill flags that
- // earlier passes (and -stop-after=twoaddr based tests) rely on, and updates
- // the LiveIntervals the register allocator consumes next.
- insertPass(&RegisterCoalescerID, &AMDGPUPrivateObjectVGPRsID);
-
if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
@@ -1909,6 +1901,12 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
}
void GCNPassConfig::addPostRegAlloc() {
+ // Lower "VGPR as memory" accesses into copies to/from the reserved block of
+ // physical VGPRs backing the file, which sits at a fixed base just above the
+ // ABI inputs. This runs after register allocation, until which the pseudos
+ // act as opaque memory operations, and before memory-aware post-RA passes so
+ // the access pseudos are no longer seen as memory operations.
+ addPass(&AMDGPUPrivateObjectVGPRsID);
addPass(&SIFixVGPRCopiesID);
if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIOptimizeExecMaskingLegacyID);
@@ -2290,6 +2288,10 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
if (EnableLowerModuleLDS)
addModulePass(AMDGPULowerModuleLDSPass(TM), PMW);
+ // Lay out "VGPR as memory" (addrspace(13)) globals into a shared register
+ // file (see the legacy pipeline above for details).
+ addModulePass(AMDGPULowerModuleVGPRsPass(), PMW);
+
// Run atomic optimizer before Atomic Expand
if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
@@ -2298,15 +2300,8 @@ void AMDGPUCodeGenPassBuilder::addIRPasses(PassManagerWrapper &PMW) const {
addFunctionPass(AtomicExpandPass(TM), PMW);
- // With optimizations enabled, do the full promotion of allocas. Without
- // optimizations, only allocate pre-existing VGPR address space allocas, which
- // is required for functionality.
- if (TM.getOptLevel() > CodeGenOptLevel::None)
- addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
- else
- addFunctionPass(AMDGPUVGPRAllocatePass(TM), PMW);
-
if (TM.getOptLevel() > CodeGenOptLevel::None) {
+ addFunctionPass(AMDGPUPromoteAllocaPass(TM), PMW);
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses(PMW);
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index dd25ab71997d7..3ca9f5bcc9f9d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp
+ AMDGPULowerModuleVGPRs.cpp
AMDGPUPrepareAGPRAlloc.cpp
AMDGPULowerExecSync.cpp
AMDGPUSwLowerLDS.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index db2301ba28359..86b8c2710e4bd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4227,6 +4227,22 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return true;
}
+bool SITargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+ // GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses, so
+ // fall back to SelectionDAG (which does) for any instruction that produces or
+ // consumes such a pointer (or pointer vector). TODO: add the GlobalISel path.
+ auto IsVGPRPtr = [](const Value *V) {
+ Type *Ty = V->getType()->getScalarType();
+ return Ty->isPointerTy() && Ty->getPointerAddressSpace() == AMDGPUAS::VGPR;
+ };
+ if (IsVGPRPtr(&Inst))
+ return true;
+ for (const Value *Op : Inst.operands())
+ if (IsVGPRPtr(Op))
+ return true;
+ return false;
+}
+
namespace {
// Chain calls have special arguments that we need to handle. These are
// tagging along at the end of the arguments list(s), after the SGPR and VGPR
@@ -5228,11 +5244,16 @@ emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
Register CondReg = MRI.createVirtualRegister(BoolRC);
- BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
- .addReg(InitReg)
- .addMBB(&OrigBB)
- .addReg(ResultReg)
- .addMBB(&LoopBB);
+ // A zero PhiReg means the caller threads no per-iteration result value
+ // through the loop (e.g. a store whose destination is a fixed physical
+ // register), so the result PHI - and its requirement that ResultReg be
+ // live-out of the loop - is omitted.
+ if (PhiReg)
+ BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
+ .addReg(InitReg)
+ .addMBB(&OrigBB)
+ .addReg(ResultReg)
+ .addMBB(&LoopBB);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
.addReg(InitSaveExecReg)
@@ -5594,6 +5615,153 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
+// Expand a "VGPR as memory" access at a runtime dword index into an indirect
+// read/write of the reserved VGPR file, reusing the same movrel / s_set_gpr_idx
+// machinery (and waterfall loop for a divergent index) as indirect vector
+// element access. The file's physical register block stands in for the
+// "vector".
+static MachineBasicBlock *emitVGPRFrameDynamic(MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ const GCNSubtarget &ST) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const bool IsLoad = MI.getOpcode() == AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32;
+
+ // The reserved file block stands in for the indirectly-indexed "vector".
+ auto [BaseIdx, Count] = TRI.getVGPRMemoryFile(*MF);
+ assert(Count && "dynamic VGPR-memory access without a reserved file");
+ const TargetRegisterClass *VecRC = TRI.getVGPRClassForBitWidth(Count * 32);
+ if (!VecRC)
+ report_fatal_error("VGPR-as-memory file too large for a dynamic index");
+ MCRegister FileReg = TRI.getMatchingSuperReg(
+ AMDGPU::VGPR_32RegClass.getRegister(BaseIdx), AMDGPU::sub0, VecRC);
+ // movrel reads name the base sub-register directly (a subregister index is
+ // not allowed on a physical-register operand), with the whole file tuple as
+ // an implicit use.
+ MCRegister FileBaseReg = AMDGPU::VGPR_32RegClass.getRegister(BaseIdx);
+
+ const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
+ const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
+ const bool UseGPRIdxMode = ST.useVGPRIndexMode();
+
+ // The index is file-relative (the constant part was folded in during ISel),
+ // so the sub-register base is sub0 and the extra offset is zero.
+ unsigned SubReg = AMDGPU::sub0;
+ int Offset = 0;
+
+ MachineBasicBlock::iterator I(&MI);
+
+ // Uniform (scalar) index: emit the access directly.
+ if (TRI.isSGPRClass(IdxRC)) {
+ if (IsLoad) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (UseGPRIdxMode) {
+ Register IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+ BuildMI(
+ MBB, I, DL,
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true),
+ Dst)
+ .addReg(FileReg)
+ .addReg(IdxReg)
+ .addImm(SubReg);
+ } else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(FileBaseReg)
+ .addReg(FileReg, RegState::Implicit);
+ }
+ } else {
+ const MachineOperand *Val =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (UseGPRIdxMode) {
+ Register IdxReg = getIndirectSGPRIdx(TII, MRI, MI, Offset);
+ BuildMI(
+ MBB, I, DL,
+ TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false),
+ FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addReg(IdxReg)
+ .addImm(SubReg);
+ } else {
+ setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
+ BuildMI(MBB, I, DL,
+ TII->getIndirectRegWriteMovRelPseudo(
+ TRI.getRegSizeInBits(*VecRC), 32, false),
+ FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addImm(SubReg);
+ }
+ }
+ MI.eraseFromParent();
+ return &MBB;
+ }
+
+ // Divergent (per-lane) index: a waterfall loop activates the lanes that share
+ // the just-read index, performs the access for them against the file, and
+ // repeats until every lane is covered. The file lives in fixed (reserved)
+ // physical registers, so unlike indirect vector access it is not threaded
+ // through a PHI; the per-lane access reads/writes it in place under EXEC.
+ // The stored value is re-used on every loop iteration, so it must stay live
+ // across the back-edge.
+ if (!IsLoad)
+ MRI.clearKillFlags(
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg());
+
+ // A load threads the loaded element as the loop result; a store writes the
+ // file in place and threads nothing (PhiReg == 0 skips the result PHI).
+ Register PhiReg, InitReg;
+ if (IsLoad) {
+ PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
+ }
+
+ Register SGPRIdxReg;
+ auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
+ UseGPRIdxMode, SGPRIdxReg);
+ MachineBasicBlock *LoopBB = InsPt->getParent();
+ unsigned VecBits = TRI.getRegSizeInBits(*VecRC);
+
+ if (IsLoad) {
+ Register Dst = MI.getOperand(0).getReg();
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, true),
+ Dst)
+ .addReg(FileReg)
+ .addReg(SGPRIdxReg)
+ .addImm(SubReg);
+ } else {
+ BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
+ .addReg(FileBaseReg)
+ .addReg(FileReg, RegState::Implicit);
+ }
+ } else {
+ const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+ if (UseGPRIdxMode) {
+ BuildMI(*LoopBB, InsPt, DL, TII->getIndirectGPRIDXPseudo(VecBits, false),
+ FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addReg(SGPRIdxReg)
+ .addImm(SubReg);
+ } else {
+ BuildMI(*LoopBB, InsPt, DL,
+ TII->getIndirectRegWriteMovRelPseudo(VecBits, 32, false), FileReg)
+ .addReg(FileReg)
+ .add(*Val)
+ .addImm(SubReg);
+ }
+ }
+
+ MI.eraseFromParent();
+ return LoopBB;
+}
+
static MachineBasicBlock *expand64BitScalarArithmetic(MachineInstr &MI,
MachineBasicBlock *BB) {
// For targets older than GFX12, we emit a sequence of 32-bit operations.
@@ -6457,7 +6625,7 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
else
ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
}
- ClampInstr.addReg(Src0); // src0
+ ClampInstr.addReg(Src0); // src0
if (isFPOp)
ClampInstr.addImm(SISrcMods::NONE); // src1 mod
ClampInstr.addReg(Src1); // src1
@@ -7121,6 +7289,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AMDGPU::SI_INDIRECT_DST_V16:
case AMDGPU::SI_INDIRECT_DST_V32:
return emitIndirectDst(MI, *BB, *getSubtarget());
+ case AMDGPU::SI_VGPR_FRAME_DYN_LOAD_B32:
+ case AMDGPU::SI_VGPR_FRAME_DYN_STORE_B32:
+ return emitVGPRFrameDynamic(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
case AMDGPU::SI_KILL_I1_PSEUDO:
return splitKillBlock(MI, BB);
@@ -9931,6 +10102,23 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
EVT PtrVT = Op.getValueType();
const GlobalValue *GV = GSD->getGlobal();
+
+ // A "VGPR as memory" (addrspace(13)) global has no numeric memory address;
+ // its "address" is the byte offset of the object within the reserved register
+ // file (assigned by AMDGPULowerModuleVGPRs and recorded as metadata). Lower
+ // it to that constant offset so that, even when the address is materialized
+ // standalone (e.g. from a constant-expression GEP), it never reaches the
+ // pc-relative global-address sequence. SIISelLowering's REG_{LOAD,STORE}
+ // folding then turns the access into a register copy/indexed move.
+ if (GSD->getAddressSpace() == AMDGPUAS::VGPR) {
+ uint64_t Offset = GSD->getOffset();
+ if (const auto *GVar = dyn_cast<GlobalVariable>(GV))
+ if (MDNode *MD = GVar->getMetadata("amdgpu.vgpr.memory.offset"))
+ Offset +=
+ mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
+ return DAG.getConstant(Offset, DL, PtrVT);
+ }
+
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
@@ -12432,18 +12620,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case 12:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
- Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
- : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
break;
case 16:
if (!Subtarget->hasLDSLoadB96_B128())
return SDValue();
- Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
- : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
- : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
+ Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
+ : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
+ : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
break;
}
@@ -12473,11 +12661,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
? 1
: 0,
- DL, MVT::i8)); // swz
+ DL, MVT::i8)); // swz
Ops.push_back(
DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
- Ops.push_back(M0Val.getValue(0)); // Chain
- Ops.push_back(M0Val.getValue(1)); // Glue
+ Ops.push_back(M0Val.getValue(0)); // Chain
+ Ops.push_back(M0Val.getValue(1)); // Glue
auto *M = cast<MemSDNode>(Op);
auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
@@ -12555,7 +12743,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Ops.push_back(VOffset);
}
- Ops.push_back(Op.getOperand(5)); // Offset
+ Ops.push_back(Op.getOperand(5)); // Offset
unsigned Aux = Op.getConstantOperandVal(6);
Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
@@ -14330,48 +14518,209 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
}
-/// Lower a load/store of a "VGPR as memory" object (an alloca in
-/// AMDGPUAS::VGPR) into an AMDGPUISD::REG_{LOAD,STORE} node carrying the
-/// constant dword index of the access within the per-function VGPR file. These
-/// nodes are selected into register copies via the SI_VGPR_FRAME_* pseudos and
-/// the AMDGPUPrivateObjectVGPRs pass.
+/// Lower a load/store of a "VGPR as memory" object (a global in AMDGPUAS::VGPR)
+/// into an AMDGPUISD::REG_{LOAD,STORE} node carrying the dword index of the
+/// access within the reserved VGPR file. A constant index selects the
+/// SI_VGPR_FRAME_* pseudos (rewritten to register copies by
+/// AMDGPUPrivateObjectVGPRs); a runtime index selects the SI_VGPR_FRAME_DYN_*
+/// pseudos (expanded to an indexed register move). Sub-dword (i8/i16) accesses
+/// are realized as a read-modify-write of the containing dword.
///
-/// Returns SDValue() if the access cannot (yet) be resolved to a constant file
-/// offset; such objects are demoted to scratch by AMDGPUPromoteAlloca, so any
-/// access that survives to here is expected to fold to a constant offset.
+/// Returns SDValue() for an access this routine does not (yet) handle (e.g. a
+/// wider-than-dword dynamic access), leaving it for the caller.
SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
SelectionDAG &DAG) const {
MemSDNode *MemOp = cast<MemSDNode>(Op);
- const MachineFunction &MF = DAG.getMachineFunction();
SDLoc DL(Op);
- // Resolve the constant byte offset of the access within the VGPR file
- // directly from the frame index (plus a constant GEP offset); the frame index
- // itself is not custom-lowered.
+ // The "VGPR as memory" pointer value is the byte offset of the access within
+ // the reserved register file. After stripping a folded GEP offset, the base
+ // is one of: the addrspace(13) global itself (its offset is recorded as
+ // metadata) before LowerGlobalAddress runs; the constant that
+ // LowerGlobalAddress folds that global to; or a runtime value (dynamic
+ // index).
SDValue Ptr = MemOp->getBasePtr();
unsigned ExtraOffset = 0;
+ SDValue DynByteOffset; // non-constant byte offset, for a runtime index
if (Ptr.getOpcode() == ISD::ADD || Ptr.getOpcode() == ISD::PTRADD) {
- auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1));
- if (!C)
- return SDValue();
- ExtraOffset = C->getZExtValue();
+ if (auto *C = dyn_cast<ConstantSDNode>(Ptr.getOperand(1)))
+ ExtraOffset = C->getZExtValue();
+ else
+ DynByteOffset = Ptr.getOperand(1);
Ptr = Ptr.getOperand(0);
}
- auto *FI = dyn_cast<FrameIndexSDNode>(Ptr);
- if (!FI)
- return SDValue();
- const AllocaInst *AI = MF.getFrameInfo().getObjectAllocation(FI->getIndex());
- if (!AI || AI->getAddressSpace() != AMDGPUAS::VGPR)
- return SDValue();
- unsigned ByteOffset =
- AMDGPU::AllocatedVGPRsMetadata::get(*AI).Address + ExtraOffset;
- if (ByteOffset % 4 != 0)
- return SDValue();
+ unsigned ByteOffset = ExtraOffset;
+ if (auto *GA = dyn_cast<GlobalAddressSDNode>(Ptr)) {
+ if (GA->getAddressSpace() != AMDGPUAS::VGPR)
+ return SDValue();
+ const auto *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
+ if (!GV)
+ return SDValue();
+ MDNode *MD = GV->getMetadata("amdgpu.vgpr.memory.offset");
+ if (!MD)
+ return SDValue();
+ ByteOffset +=
+ mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue() +
+ GA->getOffset();
+ } else if (auto *C = dyn_cast<ConstantSDNode>(Ptr)) {
+ ByteOffset += C->getZExtValue();
+ } else {
+ // The base is itself a runtime byte offset.
+ if (DynByteOffset)
+ return SDValue(); // two independent dynamic terms; unsupported
+ DynByteOffset = Ptr;
+ }
EVT MemVT = MemOp->getMemoryVT();
unsigned BitWidth = MemVT.getSizeInBits();
- // Only whole-dword accesses are kept in registers; sub-dword and
- // non-dword-multiple objects are demoted to scratch by AMDGPUPromoteAlloca.
+ MachineFunction &MFn = DAG.getMachineFunction();
+ SDValue Chain = MemOp->getChain();
+
+ auto GetDwordMMO = [&](MachineMemOperand::Flags F) {
+ return MFn.getMachineMemOperand(MemOp->getPointerInfo(), F, /*Size=*/4,
+ Align(4));
+ };
+
+ // Runtime (non-constant) index. The constant part is added to the runtime
+ // byte offset and the sum is divided by 4 to give the dword index, which the
+ // REG_{LOAD,STORE} node carries as a register. Sub-dword (8/16-bit) accesses
+ // extract from / read-modify-write the containing dword at a runtime-computed
+ // bit position (race-free because vector registers are per-lane storage).
+ if (DynByteOffset) {
+ if (BitWidth != 8 && BitWidth != 16 && BitWidth != 32)
+ return SDValue();
+ SDValue DynI32 = DAG.getZExtOrTrunc(DynByteOffset, DL, MVT::i32);
+ SDValue Bytes = DAG.getNode(ISD::ADD, DL, MVT::i32, DynI32,
+ DAG.getConstant(ByteOffset, DL, MVT::i32));
+ SDValue Index = DAG.getNode(ISD::SRL, DL, MVT::i32, Bytes,
+ DAG.getConstant(2, DL, MVT::i32));
+
+ auto LoadDword = [&]() {
+ SDValue Ld = DAG.getMemIntrinsicNode(
+ AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+ Chain = Ld.getValue(1);
+ return Ld;
+ };
+
+ if (BitWidth == 8 || BitWidth == 16) {
+ // Bit position of the field within its dword, computed at runtime. An
+ // aligned i8/i16 access never crosses a dword boundary (the file is
+ // dword-aligned and the element-scaled offset keeps the field inside one
+ // dword).
+ SDValue ByteInDword = DAG.getNode(ISD::AND, DL, MVT::i32, Bytes,
+ DAG.getConstant(3, DL, MVT::i32));
+ SDValue BitInDword = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteInDword,
+ DAG.getConstant(3, DL, MVT::i32));
+ uint32_t LowMask = maskTrailingOnes<uint32_t>(BitWidth);
+ SDValue LowMaskC = DAG.getConstant(LowMask, DL, MVT::i32);
+
+ if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+ SDValue Old = LoadDword();
+ SDValue Val = DAG.getZExtOrTrunc(StoreOp->getValue(), DL, MVT::i32);
+ Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val, LowMaskC);
+ Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val, BitInDword);
+ SDValue MaskShifted =
+ DAG.getNode(ISD::SHL, DL, MVT::i32, LowMaskC, BitInDword);
+ SDValue Cleared = DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+ DAG.getNOT(DL, MaskShifted, MVT::i32));
+ SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+ DAG.getVTList(MVT::Other),
+ {Chain, New, Index}, MVT::i32,
+ GetDwordMMO(MachineMemOperand::MOStore));
+ }
+
+ auto *LoadOp = cast<LoadSDNode>(MemOp);
+ SDValue Field =
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LoadDword(), BitInDword);
+ EVT ResVT = LoadOp->getValueType(0);
+ if (LoadOp->getExtensionType() == ISD::SEXTLOAD)
+ Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+ DAG.getValueType(MemVT));
+ else
+ Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field, LowMaskC);
+ SDValue Result = ResVT == MVT::i32
+ ? Field
+ : DAG.getNode(ISD::TRUNCATE, DL, ResVT, Field);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
+
+ // Whole 32-bit dynamic access.
+ if (ByteOffset % 4 != 0)
+ return SDValue();
+ if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+ SDValue Val = DAG.getBitcast(MVT::i32, StoreOp->getValue());
+ return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+ DAG.getVTList(MVT::Other),
+ {Chain, Val, Index}, MVT::i32,
+ GetDwordMMO(MachineMemOperand::MOStore));
+ }
+ auto *LoadOp = cast<LoadSDNode>(MemOp);
+ if (LoadOp->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+ SDValue Ld = LoadDword();
+ EVT ResVT = LoadOp->getValueType(0);
+ SDValue Res = ResVT == MVT::i32 ? Ld : DAG.getBitcast(ResVT, Ld);
+ return DAG.getMergeValues({Res, Chain}, DL);
+ }
+
+ // Sub-dword (8/16-bit) constant-index access. Registers have no sub-dword
+ // addressing, so the field is extracted from (loads) or inserted into (stores
+ // via read-modify-write) the dword that contains it, using shifts and masks.
+ if (BitWidth == 8 || BitWidth == 16) {
+ unsigned BitInDword = (ByteOffset % 4) * 8;
+ if (BitInDword + BitWidth > 32)
+ return SDValue(); // field crosses a dword boundary; unsupported
+ SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
+ uint32_t LowMask = maskTrailingOnes<uint32_t>(BitWidth);
+
+ if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
+ SDValue Old = DAG.getMemIntrinsicNode(
+ AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+ Chain = Old.getValue(1);
+ SDValue Val = DAG.getZExtOrTrunc(StoreOp->getValue(), DL, MVT::i32);
+ Val = DAG.getNode(ISD::AND, DL, MVT::i32, Val,
+ DAG.getConstant(LowMask, DL, MVT::i32));
+ if (BitInDword)
+ Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Val,
+ DAG.getConstant(BitInDword, DL, MVT::i32));
+ SDValue Cleared =
+ DAG.getNode(ISD::AND, DL, MVT::i32, Old,
+ DAG.getConstant(~(LowMask << BitInDword), DL, MVT::i32));
+ SDValue New = DAG.getNode(ISD::OR, DL, MVT::i32, Cleared, Val);
+ return DAG.getMemIntrinsicNode(AMDGPUISD::REG_STORE, DL,
+ DAG.getVTList(MVT::Other),
+ {Chain, New, Index}, MVT::i32,
+ GetDwordMMO(MachineMemOperand::MOStore));
+ }
+
+ auto *LoadOp = cast<LoadSDNode>(MemOp);
+ SDValue Dword = DAG.getMemIntrinsicNode(
+ AMDGPUISD::REG_LOAD, DL, DAG.getVTList(MVT::i32, MVT::Other),
+ {Chain, Index}, MVT::i32, GetDwordMMO(MachineMemOperand::MOLoad));
+ Chain = Dword.getValue(1);
+ SDValue Field = Dword;
+ if (BitInDword)
+ Field = DAG.getNode(ISD::SRL, DL, MVT::i32, Field,
+ DAG.getConstant(BitInDword, DL, MVT::i32));
+ EVT ResVT = LoadOp->getValueType(0);
+ if (LoadOp->getExtensionType() == ISD::SEXTLOAD)
+ Field = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Field,
+ DAG.getValueType(MemVT));
+ else
+ Field = DAG.getNode(ISD::AND, DL, MVT::i32, Field,
+ DAG.getConstant(LowMask, DL, MVT::i32));
+ SDValue Result = ResVT == MVT::i32
+ ? Field
+ : DAG.getNode(ISD::TRUNCATE, DL, ResVT, Field);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
+
+ // Whole-dword accesses.
+ if (ByteOffset % 4 != 0)
+ return SDValue();
if (BitWidth == 0 || BitWidth % 32 != 0)
return SDValue();
if (!Subtarget->getRegisterInfo()->getVGPRClassForBitWidth(BitWidth))
@@ -14397,7 +14746,6 @@ SDValue SITargetLowering::LowerLoadStoreVGPR(SDValue Op,
}
SDValue Index = DAG.getConstant(ByteOffset / 4, DL, MVT::i32);
- SDValue Chain = MemOp->getChain();
if (auto *StoreOp = dyn_cast<StoreSDNode>(MemOp)) {
SDValue Value = StoreOp->getValue();
if (RegVT != MemVT)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 37f3bb37d1aef..aa1b11e3c4c68 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -34,17 +34,16 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const GCNSubtarget *Subtarget;
public:
- MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
+ MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const override;
- unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
- CallingConv::ID CC, EVT VT,
- EVT &IntermediateVT,
- unsigned &NumIntermediates,
- MVT &RegisterVT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(
+ LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
+ unsigned &NumIntermediates, MVT &RegisterVT) const override;
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const;
@@ -73,7 +72,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV,
AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const;
- SDValue getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI,
+ SDValue getPreloadedValue(SelectionDAG &DAG,
+ const SIMachineFunctionInfo &MFI,
EVT VT,
AMDGPUFunctionArgInfo::PreloadedValue) const;
@@ -81,8 +81,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SelectionDAG &DAG) const override;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op, MVT VT,
- unsigned Offset) const;
+ SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
+ MVT VT, unsigned Offset) const;
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG, bool WithChain) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
@@ -134,8 +134,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const;
- SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M, SelectionDAG &DAG,
- ArrayRef<SDValue> Ops,
+ SDValue adjustLoadValueType(unsigned Opcode, MemSDNode *M,
+ SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
@@ -152,12 +152,14 @@ class SITargetLowering final : public AMDGPUTargetLowering {
/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
- SDValue getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
+ SDValue getFPExtOrFPRound(SelectionDAG &DAG,
+ SDValue Op,
+ const SDLoc &DL,
EVT VT) const;
- SDValue convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
- SDValue Val, bool Signed,
- const ISD::InputArg *Arg = nullptr) const;
+ SDValue convertArgType(
+ SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+ bool Signed, const ISD::InputArg *Arg = nullptr) const;
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
@@ -193,10 +195,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
- SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performUCharToFloatCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const;
SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT,
+ SDValue performSHLPtrCombine(SDNode *N,
+ unsigned AS,
+ EVT MemVT,
DAGCombinerInfo &DCI) const;
SDValue performMemSDNodeCombine(MemSDNode *N, DAGCombinerInfo &DCI) const;
@@ -230,8 +235,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
- unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0,
- const SDNode *N1) const;
+ unsigned getFusedOpcode(const SelectionDAG &DAG,
+ const SDNode *N0, const SDNode *N1) const;
SDValue tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue foldAddSub64WithZeroLowBitsTo32(SDNode *N,
DAGCombinerInfo &DCI) const;
@@ -394,7 +399,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
getPreferredVectorAction(MVT VT) const override;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
- Type *Ty) const override;
+ Type *Ty) const override;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
@@ -413,8 +418,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ MachineBasicBlock *Entry,
+ const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@@ -422,8 +427,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
- bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
- bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context, const Type *RetTy) const override;
@@ -432,11 +437,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- void
- passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo,
- const SIMachineFunctionInfo &Info,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
- SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const;
+ void passSpecialInputs(
+ CallLoweringInfo &CLI,
+ CCState &CCInfo,
+ const SIMachineFunctionInfo &Info,
+ SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
+ SmallVectorImpl<SDValue> &MemOpChains,
+ SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
@@ -447,11 +454,13 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool mayBeEmittedAsTailCall(const CallInst *) const override;
+ bool fallBackToDAGISel(const Instruction &Inst) const override;
+
bool isEligibleForTailCallOptimization(
- SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+ SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
@@ -467,7 +476,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
- Register getRegisterByName(const char *RegName, LLT VT,
+ Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
@@ -524,7 +533,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint,
uint64_t Val) const;
- bool checkAsmConstraintValA(SDValue Op, uint64_t Val,
+ bool checkAsmConstraintValA(SDValue Op,
+ uint64_t Val,
unsigned MaxSize = 64) const;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
@@ -535,7 +545,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
- void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known,
+ void computeKnownBitsForFrameIndex(int FrameIdx,
+ KnownBits &Known,
const MachineFunction &MF) const override;
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R,
KnownBits &Known,
@@ -581,7 +592,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
void emitExpandAtomicLoad(LoadInst *LI) const override;
void emitExpandAtomicStore(StoreInst *SI) const override;
- LoadInst *lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
const TargetRegisterClass *getRegClassFor(MVT VT,
bool isDivergent) const override;
@@ -591,7 +603,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
unsigned
getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override;
- void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF,
+ void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
@@ -606,21 +619,28 @@ class SITargetLowering final : public AMDGPUTargetLowering {
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF,
+ void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv, bool IsShader) const;
+ CallingConv::ID CallConv,
+ bool IsShader) const;
- void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+ void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) const;
-
- void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF,
+ void allocateSpecialInputSGPRs(
+ CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
- void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF,
+ void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
+ MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 80a42a66b2368..88aa30ff206ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1244,11 +1244,10 @@ def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
} // End Spill = 1, VALU = 1, isConvergent = 1
// "VGPR as memory" pseudo accesses: a load/store of a whole VGPR tuple (one or
-// more dwords) from/to an alloca in the VGPR address space (AMDGPUAS::VGPR), at
-// a constant dword index within the per-function VGPR file. They are selected
-// from AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
-// register copies by the AMDGPUPrivateObjectVGPRs pass before register
-// allocation.
+// more dwords) from/to a global in the VGPR address space (AMDGPUAS::VGPR), at a
+// constant dword index within the reserved VGPR file. They are selected from
+// AMDGPUISD::REG_{LOAD,STORE} (with a constant index) and rewritten into
+// register copies by the AMDGPUPrivateObjectVGPRs pass.
let hasSideEffects = 0 in {
foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
VReg_224, VReg_256, VReg_288, VReg_320, VReg_352, VReg_384,
@@ -1266,6 +1265,25 @@ foreach rc = [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192,
}
} // End hasSideEffects = 0
+// "VGPR as memory" pseudo accesses at a *runtime* dword index. The index is a
+// VS_32 ($idx) that the custom inserter resolves into an indirect read/write
+// against the reserved VGPR file (movrel / s_set_gpr_idx, with a waterfall loop
+// for a divergent index), so unlike the constant pseudos these are expanded in
+// EmitInstrWithCustomInserter rather than by AMDGPUPrivateObjectVGPRs. Currently
+// only 32-bit accesses are handled dynamically.
+let usesCustomInserter = 1, hasSideEffects = 0, UseNamedOperandTable = 1 in {
+ def SI_VGPR_FRAME_DYN_LOAD_B32 : VPseudoInstSI <
+ (outs VGPR_32:$vdst), (ins VS_32:$idx)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ }
+ def SI_VGPR_FRAME_DYN_STORE_B32 : VPseudoInstSI <
+ (outs), (ins VGPR_32:$vdata, VS_32:$idx)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ }
+}
+
// Select AMDGPUISD::REG_{LOAD,STORE} (with a constant dword index) into the
// width-matched frame pseudo.
multiclass VGPRFrameLoadStorePat<ValueType vt> {
@@ -1276,6 +1294,17 @@ multiclass VGPRFrameLoadStorePat<ValueType vt> {
(store_inst $data, imm:$idx)>;
}
+// Select AMDGPUISD::REG_{LOAD,STORE} with a non-constant dword index into the
+// dynamic pseudo. Lower complexity than the constant patterns above, so a
+// constant index still prefers them.
+multiclass VGPRFrameDynLoadStorePat<ValueType vt> {
+ def : GCNPat<(vt (SIreg_load i32:$idx)), (SI_VGPR_FRAME_DYN_LOAD_B32 $idx)>;
+ def : GCNPat<(SIreg_store vt:$data, i32:$idx),
+ (SI_VGPR_FRAME_DYN_STORE_B32 $data, $idx)>;
+}
+foreach vt = Reg32Types.types in
+defm : VGPRFrameDynLoadStorePat<vt>;
+
foreach vt = !listconcat(
Reg32Types.types, Reg64Types.types, Reg96Types.types, Reg128Types.types,
Reg160Types.types, Reg192Types.types, Reg224Types.types, Reg256Types.types,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 4be4ce28e6de5..46687a6b061a7 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -183,6 +183,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
"amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
+ // "VGPR as memory" file layout assigned module-wide by
+ // AMDGPULowerModuleVGPRs. When present, the file size is fixed here (offsets
+ // come from per-global metadata) and the base register is shared across the
+ // call graph.
+ if (F.hasFnAttribute("amdgpu-vgpr-memory-size"))
+ VGPRMemorySize =
+ F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-size", 0);
+ if (F.hasFnAttribute("amdgpu-vgpr-memory-base"))
+ VGPRMemoryBase =
+ F.getFnAttributeAsParsedInteger("amdgpu-vgpr-memory-base", ~0u);
+
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
// VGPR available at all times. For now, reserve highest available VGPR. After
// RA, shift it to the lowest available unused VGPR if the one exist.
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 1f43505650222..7568608a0b881 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -587,6 +587,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
// the serialization easier.
ReservedRegSet WWMReservedRegs;
+ // "VGPR as memory" (AMDGPUAS::VGPR / addrspace(13)) file, assigned by
+ // AMDGPULowerModuleVGPRs: VGPRMemorySize (bytes) comes from the
+ // "amdgpu-vgpr-memory-size" attribute (per-global offsets come from metadata)
+ // and VGPRMemoryBase is the shared base register index from
+ // "amdgpu-vgpr-memory-base" (~0u means "no module-assigned base; derive it").
+ // The file is reserved out of allocation for the whole function, like LDS,
+ // and accesses are lowered to register copies / indexed moves.
+ unsigned VGPRMemorySize = 0;
+ unsigned VGPRMemoryBase = ~0u;
+
bool IsWholeWaveFunction = false;
using PrologEpilogSGPRSpill =
@@ -690,6 +700,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunctionInfo,
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
+ // "VGPR as memory" (addrspace(13)) file size in bytes (0 if the function has
+ // no such objects) and shared base register index, both assigned module-wide
+ // by AMDGPULowerModuleVGPRs (~0u base means the backend should derive it; see
+ // SIRegisterInfo::getVGPRMemoryFile).
+ unsigned getVGPRMemorySize() const { return VGPRMemorySize; }
+ unsigned getVGPRMemoryBase() const { return VGPRMemoryBase; }
+
bool isWWMReservedRegister(Register Reg) const {
return WWMReservedRegs.contains(Reg);
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 9700720f0373a..0103c2c22e481 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -584,6 +584,51 @@ MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
+std::pair<unsigned, unsigned>
+SIRegisterInfo::getVGPRMemoryFile(const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned Bytes = MFI->getVGPRMemorySize();
+ if (!Bytes)
+ return {0, 0};
+
+ // Round to an even number of dwords so wide (>=64-bit) tuple accesses start
+ // on an aligned register on targets that require aligned VGPR tuples.
+ unsigned Dwords = alignTo(divideCeil(Bytes, 4u), 2u);
+
+ // Compute the lowest base that clears this function's own ABI input registers
+ // (workitem ID for kernels, argument VGPRs for functions). The file sits at
+ // the low end so the register allocator stacks the rest of the function on
+ // top of it and the file costs only its own size in the VGPR count, rather
+ // than pinning occupancy as a high-end placement would.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ unsigned FirstFree = 0;
+ for (const auto &LI : MRI.liveins()) {
+ MCRegister Reg = LI.first;
+ const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+ if (!RC || !isVGPRClass(RC))
+ continue;
+ unsigned End = getHWRegIndex(Reg) + getRegSizeInBits(*RC) / 32u;
+ FirstFree = std::max(FirstFree, End);
+ }
+ unsigned BaseIdx = alignTo(FirstFree, 2u);
+
+ // Prefer the shared base assigned module-wide by AMDGPULowerModuleVGPRs: it
+ // is the same in every function of the call graph (so an address resolves to
+ // the same physical register everywhere) and is chosen to clear every
+ // participating function's inputs, hence it is at or above the local base.
+ unsigned SharedBase = MFI->getVGPRMemoryBase();
+ if (SharedBase != ~0u) {
+ assert(SharedBase >= BaseIdx &&
+ "shared VGPR-memory base overlaps this function's ABI inputs");
+ BaseIdx = SharedBase;
+ }
+
+ assert(BaseIdx + Dwords <=
+ ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) &&
+ "VGPR-as-memory file does not fit");
+ return {BaseIdx, Dwords};
+}
+
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::MODE);
@@ -747,6 +792,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (Register Reg : MFI->getWWMReservedRegs())
reserveRegisterTuples(Reserved, Reg);
+ // Reserve the registers backing "VGPR as memory" (addrspace(13)) objects.
+ // The file occupies a fixed block of physical VGPRs (at the low end, just
+ // above the function's ABI inputs; see getVGPRMemoryFile) and must not be
+ // allocated for the whole function.
+ auto [VGPRMemBase, VGPRMemCount] = getVGPRMemoryFile(MF);
+ for (unsigned I = 0; I != VGPRMemCount; ++I)
+ reserveRegisterTuples(Reserved,
+ AMDGPU::VGPR_32RegClass.getRegister(VGPRMemBase + I));
+
// FIXME: Stop using reserved registers for this.
for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
reserveRegisterTuples(Reserved, Reg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 5e08e47ad4d83..afe0225bc0c0c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -96,6 +96,15 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
bool isAsmClobberable(const MachineFunction &MF,
MCRegister PhysReg) const override;
+ /// The per-function "VGPR as memory" (addrspace(13)) register file is a fixed
+ /// block of physical VGPRs reserved for the whole function (and, like LDS,
+ /// placed at a location that is consistent across the call graph). Returns
+ /// the VGPR_32 register index of the first file register and the number of
+ /// dword registers it occupies, or {0, 0} if the function has no such
+ /// objects.
+ std::pair<unsigned, unsigned>
+ getVGPRMemoryFile(const MachineFunction &MF) const;
+
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 7528cd2a009a3..96571dd028b14 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -18,7 +18,6 @@
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
@@ -1780,17 +1779,6 @@ bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val) {
return false;
}
-AllocatedVGPRsMetadata AllocatedVGPRsMetadata::get(const AllocaInst &Alloca) {
- const MDNode *MD = Alloca.getMetadata("amdgpu.allocated.vgprs");
- assert(MD && MD->getNumOperands() == 2 &&
- "expected !amdgpu.allocated.vgprs metadata with 2 operands");
- unsigned Address =
- mdconst::extract<ConstantInt>(MD->getOperand(0))->getZExtValue();
- unsigned Size =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- return {Address, Size};
-}
-
unsigned getVmcntBitMask(const IsaVersion &Version) {
return (1 << (getVmcntBitWidthLo(Version.Major) +
getVmcntBitWidthHi(Version.Major))) -
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b34dde7cb2cd7..1623dc72d2810 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -30,7 +30,6 @@ struct amd_kernel_code_t;
namespace llvm {
struct Align;
-class AllocaInst;
class Argument;
class Function;
class GlobalValue;
@@ -1033,16 +1032,6 @@ getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size);
/// Checks if \p Val is inside \p MD, a !range-like metadata.
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val);
-/// Decoded form of the \c !amdgpu.allocated.vgprs metadata attached to a
-/// "VGPR as memory" alloca: the byte offset (address) the object was allocated
-/// to within the VGPR file, and its size in bytes.
-struct AllocatedVGPRsMetadata {
- unsigned Address;
- unsigned Size;
-
- static AllocatedVGPRsMetadata get(const AllocaInst &Alloca);
-};
-
// The following methods are only meaningful on targets that support
// S_WAITCNT.
diff --git a/llvm/lib/TargetParser/TargetDataLayout.cpp b/llvm/lib/TargetParser/TargetDataLayout.cpp
index a2125eeb82932..67365cdc38b88 100644
--- a/llvm/lib/TargetParser/TargetDataLayout.cpp
+++ b/llvm/lib/TargetParser/TargetDataLayout.cpp
@@ -273,8 +273,10 @@ static std::string computeAMDDataLayout(const Triple &TT) {
// (address space 7), and 128-bit non-integral buffer resourcees (address
// space 8) which cannot be non-trivilally accessed by LLVM memory operations
// like getelementptr.
+ // Address space 13 ("VGPR as memory") uses 32-bit register-relative indices.
return "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
+ "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-p13:32:32-i64:"
+ "64-"
"v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
deleted file mode 100644
index f6c64c5121867..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-vgpr-allocate-basic.ll
+++ /dev/null
@@ -1,109 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s -o - | FileCheck %s
-
-define void @vgpr_alloca() {
-; CHECK-LABEL: define void @vgpr_alloca(
-; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[A:%.*]] = alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META0:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- ret void
-}
-
-define void @vgpr_alloca_multiple() {
-; CHECK-LABEL: define void @vgpr_alloca_multiple(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4, addrspace(13), !amdgpu.allocated.vgprs [[META1:![0-9]+]]
-; CHECK-NEXT: [[B:%.*]] = alloca [2 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs [[META2:![0-9]+]]
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[A]], align 4
-; CHECK-NEXT: store i32 0, ptr addrspace(13) [[B]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca i32, align 4, addrspace(13)
- %b = alloca [2 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- store i32 0, ptr addrspace(13) %b
- ret void
-}
-
-define void @private_alloca_unchanged() {
-; CHECK-LABEL: define void @private_alloca_unchanged(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca [4 x i64], align 4, addrspace(5)
-; CHECK-NEXT: store i64 42, ptr addrspace(5) [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i64], align 4, addrspace(5)
- store i64 42, ptr addrspace(5) %a
- ret void
-}
-
-declare void @use(ptr)
-
-; A dynamically-indexed VGPR object cannot be kept in registers yet, so it falls
-; back to ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_dynamic_index(i32 %idx, i32 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_dynamic_index(
-; CHECK-SAME: i32 [[IDX:%.*]], i32 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, ptr addrspace(5) [[A1]], i32 [[IDX]]
-; CHECK-NEXT: store i32 [[V]], ptr addrspace(5) [[P2]], align 4
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- %p = getelementptr i32, ptr addrspace(13) %a, i32 %idx
- store i32 %v, ptr addrspace(13) %p
- ret void
-}
-
-; A VGPR object whose address escapes (here via a cast to a generic pointer, as
-; the frontend emits) cannot be kept in registers yet, so it falls back to
-; ordinary (addrspace(5)) scratch.
-define void @vgpr_alloca_escaping() {
-; CHECK-LABEL: define void @vgpr_alloca_escaping(
-; CHECK-SAME: ) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [4 x i32], align 4, addrspace(5)
-; CHECK-NEXT: [[CAST:%.*]] = addrspacecast ptr addrspace(5) [[A1]] to ptr
-; CHECK-NEXT: call void @use(ptr [[CAST]])
-; CHECK-NEXT: ret void
-;
- %a = alloca [4 x i32], align 4, addrspace(13)
- %cast = addrspacecast ptr addrspace(13) %a to ptr
- call void @use(ptr %cast)
- ret void
-}
-
-; Whole-dword-multiple accesses (here i64) stay in VGPRs.
-define void @vgpr_alloca_i64(i64 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_i64(
-; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8, addrspace(13), !amdgpu.allocated.vgprs [[META3:![0-9]+]]
-; CHECK-NEXT: store i64 [[V]], ptr addrspace(13) [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a = alloca i64, align 8, addrspace(13)
- store i64 %v, ptr addrspace(13) %a
- ret void
-}
-
-; Sub-dword accesses are not supported yet, so the object falls back to scratch.
-define void @vgpr_alloca_subdword(i16 %v) {
-; CHECK-LABEL: define void @vgpr_alloca_subdword(
-; CHECK-SAME: i16 [[V:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[A1:%.*]] = alloca [2 x i16], align 4, addrspace(5)
-; CHECK-NEXT: store i16 [[V]], ptr addrspace(5) [[A1]], align 2
-; CHECK-NEXT: ret void
-;
- %a = alloca [2 x i16], align 4, addrspace(13)
- store i16 %v, ptr addrspace(13) %a
- ret void
-}
-;.
-; CHECK: [[META0]] = !{i32 0, i32 16}
-; CHECK: [[META1]] = !{i32 0, i32 4}
-; CHECK: [[META2]] = !{i32 4, i32 8}
-; CHECK: [[META3]] = !{i32 0, i32 8}
-;.
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
deleted file mode 100644
index 0a78d119ded18..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-arch.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; "VGPR as memory" (addrspace(13)) is only enabled on gfx942/gfx950 (CDNA3+)
-; and GFX12 and later. On a supported target the object is kept in addrspace(13)
-; (and lowered to VGPRs); on any other target it falls back to addrspace(5)
-; scratch.
-
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx942 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx950 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1200 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=SUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx90a -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1030 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-; RUN: opt -S -mtriple=amdgcn -mcpu=gfx1100 -passes=amdgpu-vgpr-allocate %s 2>/dev/null | FileCheck %s --check-prefix=UNSUPP
-
-define void @vgpr_obj() {
-; SUPP: alloca [4 x i32], align 4, addrspace(13), !amdgpu.allocated.vgprs
-; UNSUPP: alloca [4 x i32], align 4, addrspace(5){{$}}
- %a = alloca [4 x i32], align 4, addrspace(13)
- store i32 0, ptr addrspace(13) %a
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll b/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
deleted file mode 100644
index ea914907a900d..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/as-vgpr-alloca-static.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; RUN: llc -O2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s -o /dev/null
-
-; "VGPR as memory" objects (allocas in addrspace(13)) accessed at constant
-; indices must lower to register copies, never to scratch/buffer memory traffic.
-
-; CHECK-LABEL: store_load_i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_i32(i32 %v) {
- %a = alloca i32, align 4, addrspace(13)
- store i32 %v, ptr addrspace(13) %a
- %l = load i32, ptr addrspace(13) %a
- %r = add i32 %l, 1
- ret i32 %r
-}
-
-; CHECK-LABEL: store_load_array:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i32 @store_load_array(i32 %v) {
- %a = alloca [4 x i32], align 4, addrspace(13)
- %p1 = getelementptr i32, ptr addrspace(13) %a, i32 1
- %p3 = getelementptr i32, ptr addrspace(13) %a, i32 3
- store i32 %v, ptr addrspace(13) %p1
- store i32 7, ptr addrspace(13) %p3
- %l1 = load i32, ptr addrspace(13) %p1
- %l3 = load i32, ptr addrspace(13) %p3
- %s = add i32 %l1, %l3
- ret i32 %s
-}
-
-; A 64-bit (two-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_i64:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define i64 @store_load_i64(i64 %v) {
- %a = alloca i64, align 8, addrspace(13)
- store i64 %v, ptr addrspace(13) %a
- %l = load i64, ptr addrspace(13) %a
- %r = add i64 %l, 1
- ret i64 %r
-}
-
-; A vector (four-dword) access is split into per-dword register copies.
-; CHECK-LABEL: store_load_v4i32:
-; CHECK-NOT: scratch_
-; CHECK-NOT: buffer_
-; CHECK: s_setpc_b64
-define <4 x i32> @store_load_v4i32(<4 x i32> %v) {
- %a = alloca <4 x i32>, align 16, addrspace(13)
- store <4 x i32> %v, ptr addrspace(13) %a
- %l = load <4 x i32>, ptr addrspace(13) %a
- ret <4 x i32> %l
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
index 94173fb7b11d2..2ddb7abc42ad4 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll
@@ -27,9 +27,9 @@
; GCN-O0-NEXT: amdgpu-lower-exec-sync
; GCN-O0-NEXT: amdgpu-sw-lower-lds
; GCN-O0-NEXT: amdgpu-lower-module-lds
+; GCN-O0-NEXT: amdgpu-lower-module-vgprs
; GCN-O0-NEXT: function
; GCN-O0-NEXT: atomic-expand
-; GCN-O0-NEXT: amdgpu-vgpr-allocate
; GCN-O0-NEXT: verify
; GCN-O0-NEXT: unreachableblockelim
; GCN-O0-NEXT: ee-instrument<post-inline>
@@ -129,6 +129,7 @@
; GCN-O2-NEXT: amdgpu-lower-exec-sync
; GCN-O2-NEXT: amdgpu-sw-lower-lds
; GCN-O2-NEXT: amdgpu-lower-module-lds
+; GCN-O2-NEXT: amdgpu-lower-module-vgprs
; GCN-O2-NEXT: function
; GCN-O2-NEXT: amdgpu-atomic-optimizer
; GCN-O2-NEXT: atomic-expand
@@ -315,6 +316,7 @@
; GCN-O3-NEXT: amdgpu-lower-exec-sync
; GCN-O3-NEXT: amdgpu-sw-lower-lds
; GCN-O3-NEXT: amdgpu-lower-module-lds
+; GCN-O3-NEXT: amdgpu-lower-module-vgprs
; GCN-O3-NEXT: function
; GCN-O3-NEXT: amdgpu-atomic-optimizer
; GCN-O3-NEXT: atomic-expand
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index aabfadd33e976..960cbb1a0def2 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,15 +47,14 @@
; GCN-O0-NEXT: AMDGPU lowering of execution synchronization
; GCN-O0-NEXT: AMDGPU Software lowering of LDS
; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O0-NEXT: AMDGPU Lower Module VGPRs
; GCN-O0-NEXT: FunctionPass Manager
; GCN-O0-NEXT: Expand Atomic instructions
-; GCN-O0-NEXT: Dominator Tree Construction
-; GCN-O0-NEXT: Natural Loop Information
-; GCN-O0-NEXT: AMDGPU VGPR Allocate
; GCN-O0-NEXT: Remove unreachable blocks from the CFG
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
+; GCN-O0-NEXT: Dominator Tree Construction
; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: AMDGPU lower intrinsics
@@ -117,7 +116,6 @@
; GCN-O0-NEXT: MachineDominator Tree Construction
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
-; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
@@ -132,6 +130,7 @@
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: AMDGPU Reserve WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
+; GCN-O0-NEXT: AMDGPU Private Object VGPRs
; GCN-O0-NEXT: SI Fix VGPR copies
; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis
; GCN-O0-NEXT: Fixup Statepoint Caller Saved
@@ -210,6 +209,7 @@
; GCN-O1-NEXT: AMDGPU lowering of execution synchronization
; GCN-O1-NEXT: AMDGPU Software lowering of LDS
; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O1-NEXT: AMDGPU Lower Module VGPRs
; GCN-O1-NEXT: FunctionPass Manager
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Cycle Info Analysis
@@ -362,7 +362,6 @@
; GCN-O1-NEXT: Live Interval Analysis
; GCN-O1-NEXT: Machine Natural Loop Construction
; GCN-O1-NEXT: Register Coalescer
-; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: Rename Disconnected Subregister Components
; GCN-O1-NEXT: Rewrite Partial Register Uses
; GCN-O1-NEXT: Machine Instruction Scheduler
@@ -402,6 +401,7 @@
; GCN-O1-NEXT: Stack Slot Coloring
; GCN-O1-NEXT: Machine Copy Propagation Pass
; GCN-O1-NEXT: Machine Loop Invariant Code Motion
+; GCN-O1-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-NEXT: SI Fix VGPR copies
; GCN-O1-NEXT: SI optimize exec mask operations
; GCN-O1-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -502,6 +502,7 @@
; GCN-O1-OPTS-NEXT: AMDGPU lowering of execution synchronization
; GCN-O1-OPTS-NEXT: AMDGPU Software lowering of LDS
; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Module VGPRs
; GCN-O1-OPTS-NEXT: FunctionPass Manager
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Cycle Info Analysis
@@ -680,7 +681,6 @@
; GCN-O1-OPTS-NEXT: Live Interval Analysis
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
; GCN-O1-OPTS-NEXT: Register Coalescer
-; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: Rename Disconnected Subregister Components
; GCN-O1-OPTS-NEXT: Rewrite Partial Register Uses
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
@@ -721,6 +721,7 @@
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
; GCN-O1-OPTS-NEXT: Machine Copy Propagation Pass
; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion
+; GCN-O1-OPTS-NEXT: AMDGPU Private Object VGPRs
; GCN-O1-OPTS-NEXT: SI Fix VGPR copies
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations
; GCN-O1-OPTS-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -822,6 +823,7 @@
; GCN-O2-NEXT: AMDGPU lowering of execution synchronization
; GCN-O2-NEXT: AMDGPU Software lowering of LDS
; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O2-NEXT: AMDGPU Lower Module VGPRs
; GCN-O2-NEXT: FunctionPass Manager
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Cycle Info Analysis
@@ -1003,7 +1005,6 @@
; GCN-O2-NEXT: Live Interval Analysis
; GCN-O2-NEXT: Machine Natural Loop Construction
; GCN-O2-NEXT: Register Coalescer
-; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: Rename Disconnected Subregister Components
; GCN-O2-NEXT: Rewrite Partial Register Uses
; GCN-O2-NEXT: Machine Instruction Scheduler
@@ -1045,6 +1046,7 @@
; GCN-O2-NEXT: Stack Slot Coloring
; GCN-O2-NEXT: Machine Copy Propagation Pass
; GCN-O2-NEXT: Machine Loop Invariant Code Motion
+; GCN-O2-NEXT: AMDGPU Private Object VGPRs
; GCN-O2-NEXT: SI Fix VGPR copies
; GCN-O2-NEXT: SI optimize exec mask operations
; GCN-O2-NEXT: Remove Redundant DEBUG_VALUE analysis
@@ -1146,6 +1148,7 @@
; GCN-O3-NEXT: AMDGPU lowering of execution synchronization
; GCN-O3-NEXT: AMDGPU Software lowering of LDS
; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
+; GCN-O3-NEXT: AMDGPU Lower Module VGPRs
; GCN-O3-NEXT: FunctionPass Manager
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Cycle Info Analysis
@@ -1340,7 +1343,6 @@
; GCN-O3-NEXT: Live Interval Analysis
; GCN-O3-NEXT: Machine Natural Loop Construction
; GCN-O3-NEXT: Register Coalescer
-; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: Rename Disconnected Subregister Components
; GCN-O3-NEXT: Rewrite Partial Register Uses
; GCN-O3-NEXT: Machine Instruction Scheduler
@@ -1382,6 +1384,7 @@
; GCN-O3-NEXT: Stack Slot Coloring
; GCN-O3-NEXT: Machine Copy Propagation Pass
; GCN-O3-NEXT: Machine Loop Invariant Code Motion
+; GCN-O3-NEXT: AMDGPU Private Object VGPRs
; GCN-O3-NEXT: SI Fix VGPR copies
; GCN-O3-NEXT: SI optimize exec mask operations
; GCN-O3-NEXT: Remove Redundant DEBUG_VALUE analysis
diff --git a/llvm/test/CodeGen/AMDGPU/nullptr.ll b/llvm/test/CodeGen/AMDGPU/nullptr.ll
index 1552014dc24e0..66c618782d955 100644
--- a/llvm/test/CodeGen/AMDGPU/nullptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/nullptr.ll
@@ -55,7 +55,7 @@
@nullptr12 = global ptr addrspace(12) addrspacecast (ptr null to ptr addrspace(12))
; CHECK-LABEL: nullptr13:
-; R600-NEXT: .long 0
+; CHECK-NEXT: .long -1
@nullptr13 = global ptr addrspace(13) addrspacecast (ptr null to ptr addrspace(13))
; CHECK-LABEL: nullptr14:
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
index fc5dabc584863..1a73c35f83f8f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
@@ -49,6 +49,7 @@
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: AMDGPU Reserve WWM Registers
; O0-NEXT: Fast Register Allocator
+; O0-NEXT: AMDGPU Private Object VGPRs
; O0-NEXT: SI Fix VGPR copies
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
new file mode 100644
index 0000000000000..fb763cd31e339
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-constexpr.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O0
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=O2
+; RUN: llc -O0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; A "VGPR as memory" access through a constant-expression GEP must lower to a
+; register copy, not the pc-relative global-address sequence (which previously
+; crashed because addrspace(13) pointers are 32-bit). Exercised at -O0 too,
+; where the address is materialized standalone rather than folded.
+
+@buf = internal addrspace(13) global [4 x i32] poison
+
+define void @store_constexpr_gep(i32 %v) {
+; O0-LABEL: store_constexpr_gep:
+; O0: ; %bb.0:
+; O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT: v_mov_b32_e32 v4, v0
+; O0-NEXT: s_setpc_b64 s[30:31]
+;
+; O2-LABEL: store_constexpr_gep:
+; O2: ; %bb.0:
+; O2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT: v_mov_b32_e32 v4, v0
+; O2-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+ ret void
+}
+
+define i32 @load_constexpr_gep() {
+; O0-LABEL: load_constexpr_gep:
+; O0: ; %bb.0:
+; O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O0-NEXT: v_mov_b32_e32 v0, v4
+; O0-NEXT: s_setpc_b64 s[30:31]
+;
+; O2-LABEL: load_constexpr_gep:
+; O2: ; %bb.0:
+; O2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; O2-NEXT: v_mov_b32_e32 v0, v4
+; O2-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) getelementptr inbounds (i8, ptr addrspace(13) @buf, i32 8)
+ ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
new file mode 100644
index 0000000000000..67b5d01df95b3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-dynamic.ll
@@ -0,0 +1,288 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s -o /dev/null
+
+; A runtime (non-constant) index into a "VGPR as memory" object becomes an
+; indexed move into the reserved VGPR file: s_set_gpr_idx on gfx9, movrel on
+; gfx10+, with a waterfall loop for a divergent index.
+
+@buf = internal addrspace(13) global [16 x i32] poison
+@buf8 = internal addrspace(13) global [16 x i8] poison
+@buf16 = internal addrspace(13) global [16 x i16] poison
+
+define amdgpu_kernel void @dyn_uniform(ptr addrspace(1) %out, i32 %i, i32 %v) {
+; GFX942-LABEL: dyn_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_andn2_b32 s2, s2, -2.0
+; GFX942-NEXT: v_mov_b32_e32 v18, s3
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: global_store_dword v0, v18, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: s_and_b32 m0, s2, 0x3fffffff
+; GFX11-NEXT: v_movreld_b32_e32 v2, s3
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %i
+ store i32 %v, ptr addrspace(13) %p
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_load(ptr addrspace(1) %out) {
+; GFX942-LABEL: dyn_divergent_load:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s4, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB1_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dword v1, v18, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_load:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s3, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s3
+; GFX11-NEXT: v_movrels_b32_e32 v18, v2
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB1_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v1, v18, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_divergent_store(ptr addrspace(1) %out, i32 %v) {
+; GFX942-LABEL: dyn_divergent_store:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: s_mov_b64 s[0:1], exec
+; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s0, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB2_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_divergent_store:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x2c
+; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: s_mov_b32 s1, exec_lo
+; GFX11-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_movreld_b32_e32 v2, s0
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB2_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i32], ptr addrspace(13) @buf, i32 0, i32 %tid
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+; Sub-dword (i8/i16) at a runtime index: the containing dword is read-modify-
+; written with the bit position computed at runtime.
+define amdgpu_kernel void @dyn_i8_uniform(ptr addrspace(1) %out, i32 %i, i8 %v) {
+; GFX942-LABEL: dyn_i8_uniform:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_and_b32 s5, s2, 3
+; GFX942-NEXT: s_and_b32 s4, s3, 0xff
+; GFX942-NEXT: s_lshl_b32 s5, s5, 3
+; GFX942-NEXT: s_lshr_b32 s2, s2, 2
+; GFX942-NEXT: s_lshl_b32 s4, s4, s5
+; GFX942-NEXT: s_lshl_b32 s5, 0xff, s5
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: v_not_b32_e32 v6, s5
+; GFX942-NEXT: v_and_b32_e32 v1, v1, v6
+; GFX942-NEXT: v_or_b32_e32 v1, s4, v1
+; GFX942-NEXT: s_set_gpr_idx_on s2, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v1
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: v_mov_b32_e32 v1, s3
+; GFX942-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_i8_uniform:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s4, s2, 3
+; GFX11-NEXT: s_lshr_b32 m0, s2, 2
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: v_movrels_b32_e32 v0, v2
+; GFX11-NEXT: s_lshl_b32 s2, 0xff, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: v_not_b32_e32 v1, s2
+; GFX11-NEXT: s_and_b32 s2, s3, 0xff
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_lshl_b32 s2, s2, s4
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_or_b32_e32 v0, s2, v0
+; GFX11-NEXT: global_store_b8 v1, v6, s[0:1]
+; GFX11-NEXT: v_movreld_b32_e32 v2, v0
+; GFX11-NEXT: s_endpgm
+ %p = getelementptr [16 x i8], ptr addrspace(13) @buf8, i32 0, i32 %i
+ store i8 %v, ptr addrspace(13) %p
+ %l = load i8, ptr addrspace(13) %p
+ store i8 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @dyn_i16_divergent(ptr addrspace(1) %out, i16 %v) {
+; GFX942-LABEL: dyn_i16_divergent:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_and_b32_e32 v10, 0x3ff, v0
+; GFX942-NEXT: s_mov_b32 s2, 0xffff
+; GFX942-NEXT: v_lshlrev_b32_e32 v10, 4, v10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_and_b32 s4, s6, 0xffff
+; GFX942-NEXT: v_lshlrev_b32_e64 v11, v10, s2
+; GFX942-NEXT: v_bfe_u32 v0, v0, 1, 9
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s5, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s5, gpr_idx(SRC0)
+; GFX942-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB4_1
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: v_bfi_b32 v11, v11, 0, v12
+; GFX942-NEXT: v_lshl_or_b32 v10, s4, v10, v11
+; GFX942-NEXT: s_mov_b64 s[2:3], exec
+; GFX942-NEXT: .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_readfirstlane_b32 s4, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
+; GFX942-NEXT: s_and_saveexec_b64 vcc, vcc
+; GFX942-NEXT: s_set_gpr_idx_on s4, gpr_idx(DST)
+; GFX942-NEXT: v_mov_b32_e32 v2, v10
+; GFX942-NEXT: s_set_gpr_idx_off
+; GFX942-NEXT: s_xor_b64 exec, exec, vcc
+; GFX942-NEXT: s_cbranch_execnz .LBB4_3
+; GFX942-NEXT: ; %bb.4:
+; GFX942-NEXT: s_mov_b64 exec, s[2:3]
+; GFX942-NEXT: v_mov_b32_e32 v0, s6
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
+;
+; GFX11-LABEL: dyn_i16_divergent:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_bfe_u32 v0, v0, 1, 9
+; GFX11-NEXT: s_mov_b32 s4, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v10, 4, v1
+; GFX11-NEXT: v_lshlrev_b32_e64 v11, v10, 0xffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
+; GFX11-NEXT: .LBB4_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s5, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s5
+; GFX11-NEXT: v_movrels_b32_e32 v12, v2
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB4_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_bfi_b32 v11, v11, 0, v12
+; GFX11-NEXT: v_lshl_or_b32 v10, s3, v10, v11
+; GFX11-NEXT: s_mov_b32 s3, exec_lo
+; GFX11-NEXT: .LBB4_3: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
+; GFX11-NEXT: s_mov_b32 m0, s4
+; GFX11-NEXT: v_movreld_b32_e32 v2, v10
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
+; GFX11-NEXT: s_cbranch_execnz .LBB4_3
+; GFX11-NEXT: ; %bb.4:
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %p = getelementptr [16 x i16], ptr addrspace(13) @buf16, i32 0, i32 %tid
+ store i16 %v, ptr addrspace(13) %p
+ %l = load i16, ptr addrspace(13) %p
+ store i16 %l, ptr addrspace(1) %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
new file mode 100644
index 0000000000000..0dc6dbca45480
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-gisel-fallback.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel -global-isel-abort=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+
+; GlobalISel does not yet lower "VGPR as memory" (addrspace(13)) accesses;
+; fallBackToDAGISel makes such functions fall back to SelectionDAG, which lowers
+; them to register copies rather than crashing in reg-bank legalization.
+
+@g = internal addrspace(13) global i32 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
new file mode 100644
index 0000000000000..6da6f49a9e082
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-lower-module.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals
+; RUN: opt -mtriple=amdgcn -passes=amdgpu-lower-module-vgprs -S < %s | FileCheck %s
+
+; AMDGPULowerModuleVGPRs lays out addrspace(13) globals into per-call-graph
+; groups: disjoint kernels (@k1/@a, @k2/@b) get independent layouts and bases,
+; while functions that share a global (@writer/@reader/@g, reached from @k3)
+; share one consistent group, so the address resolves to the same registers.
+
+@a = internal addrspace(13) global [4 x i32] poison
+@b = internal addrspace(13) global [8 x i32] poison
+@g = internal addrspace(13) global i32 poison
+
+;.
+; CHECK: @a = internal addrspace(13) global [4 x i32] poison, !amdgpu.vgpr.memory.offset [[META0:![0-9]+]]
+; CHECK: @b = internal addrspace(13) global [8 x i32] poison, !amdgpu.vgpr.memory.offset [[META0]]
+; CHECK: @g = internal addrspace(13) global i32 poison, !amdgpu.vgpr.memory.offset [[META0]]
+;.
+define amdgpu_kernel void @k1(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k1(
+; CHECK-NEXT: [[P:%.*]] = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT: store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %p = getelementptr [4 x i32], ptr addrspace(13) @a, i32 0, i32 1
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @k2(ptr addrspace(1) %out) {
+; CHECK-LABEL: @k2(
+; CHECK-NEXT: [[P:%.*]] = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) [[P]], align 4
+; CHECK-NEXT: store i32 [[L]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ %p = getelementptr [8 x i32], ptr addrspace(13) @b, i32 0, i32 1
+ %l = load i32, ptr addrspace(13) %p
+ store i32 %l, ptr addrspace(1) %out
+ ret void
+}
+
+define void @writer(i32 %v) {
+; CHECK-LABEL: @writer(
+; CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(13) @g, align 4
+; CHECK-NEXT: ret void
+;
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @reader() {
+; CHECK-LABEL: @reader(
+; CHECK-NEXT: [[L:%.*]] = load i32, ptr addrspace(13) @g, align 4
+; CHECK-NEXT: ret i32 [[L]]
+;
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
+
+define amdgpu_kernel void @k3(ptr addrspace(1) %out, i32 %v) {
+; CHECK-LABEL: @k3(
+; CHECK-NEXT: call void @writer(i32 [[V:%.*]])
+; CHECK-NEXT: [[R:%.*]] = call i32 @reader()
+; CHECK-NEXT: store i32 [[R]], ptr addrspace(1) [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+ call void @writer(i32 %v)
+ %r = call i32 @reader()
+ store i32 %r, ptr addrspace(1) %out
+ ret void
+}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="16" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="32" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { "amdgpu-vgpr-memory-base"="2" "amdgpu-vgpr-memory-size"="4" }
+;.
+; CHECK: [[META0]] = !{i32 0}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
new file mode 100644
index 0000000000000..44193d15016f3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory-subdword.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; Sub-dword (i8/i16) "VGPR as memory" accesses at a constant index are realized
+; as a read-modify-write of the containing dword (shifts and masks), since
+; registers have no sub-dword addressing.
+
+@b = internal addrspace(13) global [8 x i8] poison
+@h = internal addrspace(13) global [4 x i16] poison
+
+define void @store_i8(i8 %v) {
+; CHECK-LABEL: store_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_and_b32_e32 v1, 0xffff00ff, v1
+; CHECK-NEXT: v_and_b32_e32 v0, 0xff, v0
+; CHECK-NEXT: v_lshl_or_b32 v0, v0, 8, v1
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+ store i8 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define i8 @load_i8() {
+; CHECK-LABEL: load_i8:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_bfe_u32 v0, v0, 8, 8
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [8 x i8], ptr addrspace(13) @b, i32 0, i32 1
+ %l = load i8, ptr addrspace(13) %p
+ ret i8 %l
+}
+
+define void @store_i16(i16 %v) {
+; CHECK-LABEL: store_i16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; CHECK-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+ store i16 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define signext i16 @load_i16_sext() {
+; CHECK-LABEL: load_i16_sext:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 16, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i16], ptr addrspace(13) @h, i32 0, i32 1
+ %l = load i16, ptr addrspace(13) %p
+ ret i16 %l
+}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
new file mode 100644
index 0000000000000..b567867007fb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-as-memory.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s -o /dev/null
+
+; "VGPR as memory" (addrspace(13)) accesses at a constant index lower to plain
+; register copies to/from the reserved VGPR file - never to scratch or buffer
+; memory - and writer/reader of the same global resolve to the same register.
+
+@g = internal addrspace(13) global i32 poison
+@arr = internal addrspace(13) global [4 x i32] poison
+@g64 = internal addrspace(13) global i64 poison
+
+define void @store_i32(i32 %v) {
+; CHECK-LABEL: store_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v2, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i32 %v, ptr addrspace(13) @g
+ ret void
+}
+
+define i32 @load_i32() {
+; CHECK-LABEL: load_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i32, ptr addrspace(13) @g
+ ret i32 %l
+}
+
+define void @store_arr(i32 %v) {
+; CHECK-LABEL: store_arr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+ store i32 %v, ptr addrspace(13) %p
+ ret void
+}
+
+define i32 @load_arr() {
+; CHECK-LABEL: load_arr:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, v4
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %p = getelementptr [4 x i32], ptr addrspace(13) @arr, i32 0, i32 2
+ %l = load i32, ptr addrspace(13) %p
+ ret i32 %l
+}
+
+define void @store_i64(i64 %v) {
+; CHECK-LABEL: store_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i64 %v, ptr addrspace(13) @g64
+ ret void
+}
+
+define i64 @load_i64() {
+; CHECK-LABEL: load_i64:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %l = load i64, ptr addrspace(13) @g64
+ ret i64 %l
+}
diff --git a/llvm/test/Verifier/AMDGPU/alloca.ll b/llvm/test/Verifier/AMDGPU/alloca.ll
index bd760de79c9d0..3ca15083959ad 100644
--- a/llvm/test/Verifier/AMDGPU/alloca.ll
+++ b/llvm/test/Verifier/AMDGPU/alloca.ll
@@ -2,24 +2,26 @@
target triple = "amdgcn-amd-amdhsa"
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, align 4, addrspace(9)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
+; CHECK-NEXT: %alloca.13 = alloca i32, align 4, addrspace(13)
define void @static_alloca() {
entry:
%alloca.0 = alloca i32, align 4
@@ -36,23 +38,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, i32 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, i32 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, i32 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, i32 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, i32 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, i32 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, i32 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, i32 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, i32 %n, align 4, addrspace(9)
define void @dynamic_alloca_i32(i32 %n) {
entry:
@@ -69,23 +71,23 @@ entry:
ret void
}
-; CHECK: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.0 = alloca i32, i64 %n, align 4
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.1 = alloca i32, i64 %n, align 4, addrspace(1)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.2 = alloca i32, i64 %n, align 4, addrspace(2)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.3 = alloca i32, i64 %n, align 4, addrspace(3)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.4 = alloca i32, i64 %n, align 4, addrspace(4)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.6 = alloca i32, i64 %n, align 4, addrspace(6)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.7 = alloca i32, i64 %n, align 4, addrspace(7)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.8 = alloca i32, i64 %n, align 4, addrspace(8)
-; CHECK-NEXT: alloca on amdgpu must be in addrspace(5) or addrspace(13)
+; CHECK-NEXT: alloca on amdgpu must be in addrspace(5)
; CHECK-NEXT: %alloca.9 = alloca i32, i64 %n, align 4, addrspace(9)
define void @dynamic_alloca_i64(i64 %n) {
entry:
diff --git a/llvm/test/Verifier/AMDGPU/vgpr-memory.ll b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
new file mode 100644
index 0000000000000..ebc266b6cd276
--- /dev/null
+++ b/llvm/test/Verifier/AMDGPU/vgpr-memory.ll
@@ -0,0 +1,33 @@
+; RUN: not llvm-as %s --disable-output 2>&1 | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; A "VGPR as memory" global is register-backed: it has no defined initial
+; contents and no meaningful numeric address. Diagnostics are emitted for
+; instructions before global variables, so the checks are ordered to match.
+
+; CHECK: addrspacecast to or from the VGPR address space (13) is not allowed
+; CHECK-NEXT: %cast.to = addrspacecast ptr addrspace(13) @valid.poison to ptr
+; CHECK: addrspacecast to or from the VGPR address space (13) is not allowed
+; CHECK-NEXT: %cast.from = addrspacecast ptr %p to ptr addrspace(13)
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.init
+; CHECK: global variable in the VGPR address space (13) cannot have an initializer
+; CHECK-NEXT: ptr addrspace(13) @bad.zeroinit
+
+; A poison initializer (or none) is fine.
+@valid.poison = internal addrspace(13) global i32 poison
+@valid.array = internal addrspace(13) global [4 x i32] poison
+
+@bad.init = internal addrspace(13) global i32 7
+@bad.zeroinit = internal addrspace(13) global [2 x i32] zeroinitializer
+
+define ptr @cast_from_vgpr() {
+ %cast.to = addrspacecast ptr addrspace(13) @valid.poison to ptr
+ ret ptr %cast.to
+}
+
+define ptr addrspace(13) @cast_to_vgpr(ptr %p) {
+ %cast.from = addrspacecast ptr %p to ptr addrspace(13)
+ ret ptr addrspace(13) %cast.from
+}
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index a082adbf6565e..0ec3c753c10f1 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -43,14 +43,14 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
// and that ANDGCN adds p7 and p8 as well.
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64", "amdgcn"),
"m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G1", "amdgcn"),
"m:e-e-p:64:64-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// Check that the old AMDGCN p8:128:128 definition is upgraded
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p8:128:128-G1", "amdgcn"),
"m:e-e-p:64:64-p8:128:128:128:48-G1-ni:7:8:9-p7:160:256:256:32-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// but that r600 does not.
EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G1", "r600"),
"m:e-e-p:32:32-G1");
@@ -66,7 +66,7 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
"m:e-e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:"
"64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:"
"1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:"
- "128:48-p9:192:256:256:32");
+ "128:48-p9:192:256:256:32-p13:32:32");
// Check that SystemZ adds -S64 if needed.
EXPECT_EQ(UpgradeDataLayoutString(
@@ -158,24 +158,24 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
EXPECT_EQ(UpgradeDataLayoutString("G2", "r600"), "m:e-G2");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G2", "amdgcn"),
"m:e-e-p:64:64-G2-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("G2-e-p:64:64", "amdgcn"),
"m:e-G2-e-p:64:64-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-G0", "amdgcn"),
"m:e-e-p:64:64-G0-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:"
- "192:256:256:32");
+ "192:256:256:32-p13:32:32");
// Check that AMDGCN targets don't add already declared address space 7.
- EXPECT_EQ(
- UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
- "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
- EXPECT_EQ(
- UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
- "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
- EXPECT_EQ(
- UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
- "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:256:32");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64", "amdgcn"),
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
+ EXPECT_EQ(UpgradeDataLayoutString("p7:64:64-G2-e-p:64:64", "amdgcn"),
+ "m:e-p7:64:64-G2-e-p:64:64-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:64:64-p7:64:64-G1", "amdgcn"),
+ "m:e-e-p:64:64-p7:64:64-G1-ni:7:8:9-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
// Check that SPIR & SPIRV targets don't add -G1 if there is already a -G
// flag.
@@ -216,9 +216,9 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) {
// Check that AMDGPU targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "m:e-G1");
- EXPECT_EQ(
- UpgradeDataLayoutString("", "amdgcn"),
- "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32");
+ EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"),
+ "m:e-G1-ni:7:8:9-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:"
+ "256:32-p13:32:32");
// Check that SPIR & SPIRV targets add G1 if it's not present.
EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1");
More information about the cfe-commits
mailing list