[llvm] 80ba432 - [amdgpu][nfc] Allocate kernel-specific LDS struct deterministically

Jon Chesterfield via llvm-commits llvm-commits@lists.llvm.org
Wed Sep 28 06:55:50 PDT 2022


Author: Jon Chesterfield
Date: 2022-09-28T14:55:16+01:00
New Revision: 80ba432821206ee3ba4275d48ed6b50aadfbb9d8

URL: https://github.com/llvm/llvm-project/commit/80ba432821206ee3ba4275d48ed6b50aadfbb9d8
DIFF: https://github.com/llvm/llvm-project/commit/80ba432821206ee3ba4275d48ed6b50aadfbb9d8.diff

LOG: [amdgpu][nfc] Allocate kernel-specific LDS struct deterministically

A kernel may have an associated struct for laying out LDS variables.
This patch puts that instance, if present, at a deterministic address by
allocating it at the same time as the module scope instance.

This is relatively likely to be where the instance was allocated anyway (~NFC)
but will allow later patches to calculate where a given field can be found,
which means a function that is reachable from only a single kernel will be
able to access an LDS variable with zero overhead. That will be particularly
helpful for applications that instantiate a function template containing LDS
variables once per kernel.
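
For illustration, the motivating pattern looks roughly like the following
HIP-flavoured C++ (a hypothetical sketch, not code from this patch): each
kernel that instantiates the template gets its own copy of the LDS variable,
which the lowering pass can then place in that kernel's struct.

  #include <hip/hip_runtime.h>

  // Hypothetical sketch: each kernel instantiating helper<T> gets its own
  // LDS ("__shared__") allocation, specialised per kernel.
  template <typename T>
  __device__ void helper() {
    __shared__ T scratch[64];
    scratch[threadIdx.x % 64] = T(0);
  }

  __global__ void kernel_a() { helper<int>(); }   // own int scratch in LDS
  __global__ void kernel_b() { helper<float>(); } // own float scratch in LDS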

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D127052

Added: 
    llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 4550cfdcf8834..da145ed7563d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -498,7 +498,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -582,7 +582,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
-  Info->allocateModuleLDSGlobal(F);
+  Info->allocateKnownAddressLDSGlobal(F);
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index f5e12fd960d0b..dacf87337d2ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -49,7 +49,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
-                                                  const GlobalVariable &GV) {
+                                                  const GlobalVariable &GV,
+                                                  Align Trailing) {
   auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
   if (!Entry.second)
     return Entry.first->second;
@@ -66,9 +67,8 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
 
     StaticLDSSize += DL.getTypeAllocSize(GV.getValueType());
 
-    // Update the LDS size considering the padding to align the dynamic shared
-    // memory.
-    LDSSize = alignTo(StaticLDSSize, DynLDSAlign);
+    // Align the LDS size to the requested trailing alignment, e.g. so that
+    // dynamic shared memory placed after it starts suitably aligned.
+    LDSSize = alignTo(StaticLDSSize, Trailing);
   } else {
     assert(GV.getAddressSpace() == AMDGPUAS::REGION_ADDRESS &&
            "expected region address space");
@@ -84,21 +84,62 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   return Offset;
 }
 
+const GlobalVariable *
+AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
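+  // e.g. kernel @foo is associated with @llvm.amdgcn.kernel.foo.lds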
+  std::string KernelLDSName = "llvm.amdgcn.kernel.";
+  KernelLDSName += F.getName();
+  KernelLDSName += ".lds";
+  return M->getNamedGlobal(KernelLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
 }
 
-void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
+void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
   const Module *M = F.getParent();
+
+  // This function is called before any other LDS is allocated so that it can
+  // reliably place values at known addresses. Consequently, dynamic LDS, if
+  // present, has not yet been allocated.
+
+  assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
+
   if (isModuleEntryFunction()) {
+
+    // LDS addresses start at zero and the memory is allocated per kernel
+    // launch. AMDGPULowerModuleLDSPass may group variables into a module-level
+    // struct and a struct per kernel function. If it has done so, they are
+    // allocated at statically computable addresses here.
+    //
+    // Address 0
+    // {
+    //   llvm.amdgcn.module.lds
+    // }
+    // alignment padding
+    // {
+    //   llvm.amdgcn.kernel.some-name.lds
+    // }
+    // other variables, e.g. dynamic lds, allocated after this call
+
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
+    const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+
     if (GV && !canElideModuleLDS(F)) {
-      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
       (void)Offset;
       assert(Offset == 0 &&
              "Module LDS expected to be allocated before other LDS");
     }
+
+    if (KV) {
+      // The per-kernel struct's offset is deterministic because it is
+      // allocated before any other non-module LDS variable.
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
+      (void)Offset;
+    }
   }
 }
 
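To make the "statically computable address" claim concrete, a later patch
could derive where a field of the kernel struct lives along these lines (a
minimal sketch mirroring the layout comment above; the helper is hypothetical
and not part of this patch):

  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Offset of field Index inside kernel struct KernelGV, assuming the module
  // struct (when present and not elided) sits at address 0 and the kernel
  // struct follows it, padded only up to the kernel struct's alignment.
  static uint64_t kernelFieldOffset(const DataLayout &DL,
                                    const GlobalVariable *ModuleGV,
                                    const GlobalVariable &KernelGV,
                                    unsigned Index) {
    uint64_t Base = 0;
    if (ModuleGV) {
      Align A = DL.getValueOrABITypeAlignment(KernelGV.getAlign(),
                                              KernelGV.getValueType());
      Base = alignTo(DL.getTypeAllocSize(ModuleGV->getValueType()), A);
    }
    auto *ST = cast<StructType>(KernelGV.getValueType());
    return Base + DL.getStructLayout(ST)->getElementOffset(Index);
  }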

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 97db8b7eb8d6b..63a4612e7f55d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -102,8 +102,18 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }
 
-  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
-  void allocateModuleLDSGlobal(const Function &F);
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
+    return allocateLDSGlobal(DL, GV, DynLDSAlign);
+  }
+  unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
+                             Align Trailing);
+
+  void allocateKnownAddressLDSGlobal(const Function &F);
+
+  // A kernel function may have an associated LDS allocation, and a kernel-scope
+  // LDS allocation must have an associated kernel function.
+  static const GlobalVariable *
+  getKernelLDSGlobalFromFunction(const Function &F);
 
   static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
 
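The two overloads split responsibility: the original single-argument form
keeps the old behaviour of padding the running LDS size up to DynLDSAlign,
while the known-address path passes Align() to suppress trailing padding. A
hypothetical usage sketch (function and variable names invented):

  #include "AMDGPUMachineFunction.h" // AMDGPU target-internal header
  using namespace llvm;

  static void allocateBoth(AMDGPUMachineFunction &MFI, const DataLayout &DL,
                           const GlobalVariable &OrdinaryGV,
                           const GlobalVariable &KernelStructGV) {
    // Ordinary variable: pads the running LDS size up to DynLDSAlign.
    (void)MFI.allocateLDSGlobal(DL, OrdinaryGV);
    // Known-address struct: Align() requests no trailing padding, keeping the
    // per-kernel struct immediately after the module struct.
    (void)MFI.allocateLDSGlobal(DL, KernelStructGV, Align());
  }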

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 781039689efe6..dd14ca179ea4f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2369,7 +2369,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
-  Info->allocateModuleLDSGlobal(Fn);
+  Info->allocateKnownAddressLDSGlobal(Fn);
 
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
new file mode 100644
index 0000000000000..66e2bfaeeb444
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -0,0 +1,272 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s
+
+; LDS is allocated per kernel. Module-scope variables are gathered into a struct that is
+; allocated at address zero, if used by the kernel. Kernel-scope variables are gathered into
+; a per-kernel struct and allocated immediately after the module-scope struct.
+; This test checks that the module- and kernel-scope variables are allocated in a
+; deterministic order, without spurious alignment padding between the two.
+
+; External LDS is checked because it influences LDS padding in general and because it will
+; not be moved into either the module or the kernel struct.
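+;
+; Worked layout, matching the CHECK lines below: when @module_variable is used,
+; the module struct is an i16 at offset 0, so @kernel_normal (align 2) lands at
+; offset 2 and @kernel_overalign (align 4) at offset 4; external LDS follows,
+; padded to its own alignment. When the module struct is elided, the kernel
+; struct starts at offset 0 instead.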
+
+@module_variable = addrspace(3) global i16 undef
+
+; Variables are allocated into the module-scope struct when used by a non-kernel function.
+define void @use_module() #0 {
+; CHECK-LABEL: use_module:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    ds_write_b16 v0, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  store i16 0, i16 addrspace(3)* @module_variable
+  ret void
+}
+
+; Variables used only by kernels are specialised and allocated per kernel.
+@kernel_normal = addrspace(3) global i16 undef
+@kernel_overalign = addrspace(3) global i16 undef, align 4
+
+; External LDS must not introduce padding between the module and kernel structs.
+@extern_normal = external addrspace(3) global [0 x float]
+@extern_overalign = external addrspace(3) global [0 x float], align 8
+
+; The 2^3 = 8 combinations are encoded in the function names below.
+
+define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    s_add_i32 s0, s0, 4
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    s_endpgm
+  store i16 2, i16 addrspace(3)* @kernel_normal
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_add_i32 s4, s4, 4
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, i16 addrspace(3)* @module_variable
+
+  store i16 2, i16 addrspace(3)* @kernel_normal
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    s_add_i32 s0, s0, 4
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    s_endpgm
+  store i16 2, i16 addrspace(3)* @kernel_overalign
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_add_i32 s4, s4, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, i16 addrspace(3)* @module_variable
+
+  store i16 2, i16 addrspace(3)* @kernel_overalign
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_normal, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    s_add_i32 s0, s0, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    s_endpgm
+  store i16 2, i16 addrspace(3)* @kernel_normal
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_add_i32 s4, s4, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, i16 addrspace(3)* @module_variable
+
+  store i16 2, i16 addrspace(3)* @kernel_normal
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
+; CHECK-NEXT:    s_add_i32 s0, s0, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b32 v2, v0
+; CHECK-NEXT:    s_endpgm
+  store i16 2, i16 addrspace(3)* @kernel_overalign
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
+; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    s_add_i32 s4, s4, 8
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s4
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    ds_write_b32 v3, v0
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, i16 addrspace(3)* @module_variable
+
+  store i16 2, i16 addrspace(3)* @kernel_overalign
+
+  %arrayidx1 = getelementptr inbounds [0 x float], [0 x float] addrspace(3)* @extern_overalign, i32 0, i32 %idx
+  store float 0.0, float addrspace(3)* %arrayidx1
+  ret void
+}
+
+attributes #0 = { noinline }
+attributes #1 = { "amdgpu-elide-module-lds" }

