[llvm] 80ba432 - [amdgpu][nfc] Allocate kernel-specific LDS struct deterministically
Jon Chesterfield via llvm-commits
llvm-commits@lists.llvm.org
Wed Sep 28 06:55:50 PDT 2022
Author: Jon Chesterfield
Date: 2022-09-28T14:55:16+01:00
New Revision: 80ba432821206ee3ba4275d48ed6b50aadfbb9d8
URL: https://github.com/llvm/llvm-project/commit/80ba432821206ee3ba4275d48ed6b50aadfbb9d8
DIFF: https://github.com/llvm/llvm-project/commit/80ba432821206ee3ba4275d48ed6b50aadfbb9d8.diff
LOG: [amdgpu][nfc] Allocate kernel-specific LDS struct deterministically
A kernel may have an associated struct for laying out LDS variables.
This patch puts that instance, if present, at a deterministic address by
allocating it at the same time as the module scope instance.
This is relatively likely to be where the instance was allocated anyway (~NFC)
but will allow later patches to calculate where a given field can be found,
which means a function that is only reachable from a single kernel will be
able to access an LDS variable with zero overhead. That will be particularly
helpful for applications that instantiate a function template containing LDS
variables once per kernel.
Reviewed By: arsenm
Differential Revision: https://reviews.llvm.org/D127052
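
To make the invariant concrete: after this change, the kernel struct's offset
is a pure function of IR-level information. A minimal C++ sketch of that
calculation (not code from this commit; computeKernelStructOffset is a
hypothetical helper):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  using namespace llvm;

  // The module scope struct, when used, sits at address zero; the kernel
  // struct follows immediately, padded only up to its own alignment.
  static uint64_t computeKernelStructOffset(const DataLayout &DL,
                                            const GlobalVariable *ModuleLDS,
                                            const GlobalVariable &KernelLDS) {
    uint64_t ModuleEnd = 0;
    if (ModuleLDS)
      ModuleEnd = DL.getTypeAllocSize(ModuleLDS->getValueType());
    Align A = DL.getValueOrABITypeAlignment(KernelLDS.getAlign(),
                                            KernelLDS.getValueType());
    return alignTo(ModuleEnd, A);
  }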
Added:
llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 4550cfdcf8834..da145ed7563d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateModuleLDSGlobal(F);
+ Info->allocateKnownAddressLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -582,7 +582,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateModuleLDSGlobal(F);
+ Info->allocateKnownAddressLDSGlobal(F);
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index f5e12fd960d0b..dacf87337d2ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -49,7 +49,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
- const GlobalVariable &GV) {
+ const GlobalVariable &GV,
+ Align Trailing) {
auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
if (!Entry.second)
return Entry.first->second;
@@ -66,9 +67,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
- // Update the LDS size considering the padding to align the dynamic shared
- // memory.
- LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+ // Align the LDS size to Trailing, e.g. to align dynamic shared memory.
+ LDSSize = alignTo(StaticLDSSize, Trailing);
} else {
assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
"expected region address space");
@@ -84,21 +84,62 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
+const GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+ const Module *M = F.getParent();
+ std::string KernelLDSName = "llvm.amdgcn.kernel.";
+ KernelLDSName += F.getName();
+ KernelLDSName += ".lds";
+ return M->getNamedGlobal(KernelLDSName);
+}
+
// This kernel calls no functions that require the module lds struct
static bool canElideModuleLDS(const Function &F) {
return F.hasFnAttribute("amdgpu-elide-module-lds");
}
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
const Module *M = F.getParent();
+
+ // This function is called before allocating any other LDS so that it can
+ // reliably put values at known addresses. Consequently, dynamic LDS, if
+ // present, will not yet have been allocated
+
+ assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
+
if (isModuleEntryFunction()) {
+
+ // Pointer values start from zero, memory allocated per-kernel-launch
+ // Variables can be grouped into a module level struct and a struct per
+ // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
+ // are allocated at statically computable addresses here.
+ //
+ // Address 0
+ // {
+ // llvm.amdgcn.module.lds
+ // }
+ // alignment padding
+ // {
+ // llvm.amdgcn.kernel.some-name.lds
+ // }
+ // other variables, e.g. dynamic lds, allocated after this call
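+ //
+ // For example (illustrative numbers, not taken from this patch): a 2 byte
+ // module struct followed by a kernel struct with alignment 4 places the
+ // kernel struct at offset 4, and dynamic LDS allocated later begins at
+ // the aligned end of the kernel struct.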
+
const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+ const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+
if (GV && !canElideModuleLDS(F)) {
- unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+ unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
(void)Offset;
assert(Offset == 0 &&
"Module LDS expected to be allocated before other LDS");
}
+
+ if (KV) {
+ // The per-kernel offset is deterministic because it is allocated
+ // before any other non-module LDS variables.
+ unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
+ (void)Offset;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 97db8b7eb8d6b..63a4612e7f55d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -102,8 +102,18 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return WaveLimiter;
}
- unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
- void allocateModuleLDSGlobal(const Function &F);
+ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
+ return allocateLDSGlobal(DL, GV, DynLDSAlign);
+ }
+ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
+ Align Trailing);
+
+ void allocateKnownAddressLDSGlobal(const Function &F);
+
+ // A kernel function may have an associated LDS allocation, and a kernel-scope
+ // LDS allocation must have an associated kernel function
+ static const GlobalVariable *
+ getKernelLDSGlobalFromFunction(const Function &F);
static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 781039689efe6..dd14ca179ea4f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2369,7 +2369,7 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- Info->allocateModuleLDSGlobal(Fn);
+ Info->allocateKnownAddressLDSGlobal(Fn);
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
new file mode 100644
index 0000000000000..66e2bfaeeb444
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -0,0 +1,272 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+
+; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
+; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
+; a per-kernel struct and allocated immediately after the module scope.
+; This test checks that the module and kernel scope variables are allocated in deterministic
+; order without spurious alignment padding between the two
+
+; External LDS is checked because it influences LDS padding in general and because it will
+; not be moved into either module or kernel struct
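+;
+; Worked example (illustrative, matching the checks below): module_variable is
+; an i16, so llvm.amdgcn.module.lds occupies bytes 0-1. A kernel struct holding
+; kernel_overalign (align 4) then starts at offset 4 and ends at 6, so extern
+; LDS (align 4) starts at alignTo(6, 4) = 8, which is why some kernels below
+; add 8 rather than 4 to the dynamic index.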
+
+@module_variable = addrspace(3) global i16 undef
+
+; Variables are allocated into module scope block when used by a non-kernel function
+define void @use_module() #0 {
+; CHECK-LABEL: use_module:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_write_b16 v0, v0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ store i16 0, i16 addrspace(3)* @module_variable
+ ret void
+}
+
+; Variables only used by kernels are specialised and allocated per-kernel
+@kernel_normal = addrspace(3) global i16 undef
+@kernel_overalign = addrspace(3) global i16 undef, align 4
+
+; External LDS shall not introduce padding between module and kernel scope variables
+@extern_normal = external addrspace(3) global [0 x float]
+@extern_overalign = external addrspace(3) global [0 x float], align 8
+
+; 2^3 cases encoded into function names
+
+define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_extern_normal:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: s_endpgm
+ store i16 2, i16 addrspace(3)* @kernel_normal
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_extern_normal:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s8, s8, s11
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT: s_add_u32 s0, s0, s11
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[8:9]
+; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT: s_lshl_b32 s4, s12, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: s_add_i32 s4, s4, 4
+; CHECK-NEXT: v_mov_b32_e32 v2, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: s_endpgm
+ call void @use_module()
+ store i16 1, i16 addrspace(3)* @module_variable
+
+ store i16 2, i16 addrspace(3)* @kernel_normal
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: s_add_i32 s0, s0, 4
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: s_endpgm
+ store i16 2, i16 addrspace(3)* @kernel_overalign
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s8, s8, s11
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT: s_add_u32 s0, s0, s11
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[8:9]
+; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT: s_lshl_b32 s4, s12, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: v_mov_b32_e32 v2, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: s_endpgm
+ call void @use_module()
+ store i16 1, i16 addrspace(3)* @module_variable
+
+ store i16 2, i16 addrspace(3)* @kernel_overalign
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: s_endpgm
+ store i16 2, i16 addrspace(3)* @kernel_normal
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s8, s8, s11
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT: s_add_u32 s0, s0, s11
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[8:9]
+; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT: s_lshl_b32 s4, s12, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: v_mov_b32_e32 v2, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: s_endpgm
+ call void @use_module()
+ store i16 1, i16 addrspace(3)* @module_variable
+
+ store i16 2, i16 addrspace(3)* @kernel_normal
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_lshl_b32 s0, s0, 2
+; CHECK-NEXT: s_add_i32 s0, s0, 8
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b32 v2, v0
+; CHECK-NEXT: s_endpgm
+ store i16 2, i16 addrspace(3)* @kernel_overalign
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s8, s8, s11
+; CHECK-NEXT: s_mov_b32 s32, 0
+; CHECK-NEXT: s_addc_u32 s9, s9, 0
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT: s_add_u32 s0, s0, s11
+; CHECK-NEXT: s_addc_u32 s1, s1, 0
+; CHECK-NEXT: s_getpc_b64 s[8:9]
+; CHECK-NEXT: s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT: s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT: s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT: s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT: s_lshl_b32 s4, s12, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_mov_b32_e32 v1, 1
+; CHECK-NEXT: s_add_i32 s4, s4, 8
+; CHECK-NEXT: v_mov_b32_e32 v2, 2
+; CHECK-NEXT: v_mov_b32_e32 v3, s4
+; CHECK-NEXT: ds_write_b16 v0, v1
+; CHECK-NEXT: ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT: ds_write_b32 v3, v0
+; CHECK-NEXT: s_endpgm
+ call void @use_module()
+ store i16 1, i16 addrspace(3)* @module_variable
+
+ store i16 2, i16 addrspace(3)* @kernel_overalign
+
+ %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+ store float 0.0, float addrspace(3)* %arrayidx1
+ ret void
+}
+
+attributes #0 = { noinline }
+attributes #1 = { "amdgpu-elide-module-lds" }