[llvm] 13e49dc - [amdgpu] Implement lower function LDS pass

Jon Chesterfield via llvm-commits <llvm-commits@lists.llvm.org>
Mon Mar 15 08:24:25 PDT 2021


Author: Jon Chesterfield
Date: 2021-03-15T15:24:01Z
New Revision: 13e49dcee48f7bffec17df48b87e3237aebd5b1d

URL: https://github.com/llvm/llvm-project/commit/13e49dcee48f7bffec17df48b87e3237aebd5b1d
DIFF: https://github.com/llvm/llvm-project/commit/13e49dcee48f7bffec17df48b87e3237aebd5b1d.diff

LOG: [amdgpu] Implement lower function LDS pass

Local (LDS) variables are allocated at kernel launch. This pass collects the
LDS global variables that are used from non-kernel functions, moves them into
a new struct type, and allocates an instance of that type in every kernel.
Uses are then replaced with a constantexpr offset from that instance's known
address.
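
For illustration, a condensed sketch of the rewrite (variable and function
names are invented; the exact output is shown in the attached tests):

    ; before: LDS variables used from a non-kernel function
    @v0 = addrspace(3) global float undef, align 4
    @v1 = addrspace(3) global i32 undef, align 4
    define void @f(float %x) {
      store float %x, float addrspace(3)* @v0, align 4
      %ld = load i32, i32 addrspace(3)* @v1, align 4
      ret void
    }

    ; after: one struct instance, allocated at LDS address zero by
    ; AMDGPUMachineFunction; the gep to field zero constant-folds to null,
    ; and the instance is appended to @llvm.compiler.used
    %llvm.amdgcn.module.lds.t = type { float, i32 }
    @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
    define void @f(float %x) {
      store float %x, float addrspace(3)* null, align 4
      %ld = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1), align 4
      ret void
    }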

Prior to this pass, LDS accesses from non-kernel functions are compiled to a
trap. With this pass, most such accesses are removed before reaching codegen.
The trap logic is left unchanged by this pass. It is still reachable for the
cases this pass misses, notably the extern __shared__ construct from HIP and
variables marked constant which survive the optimizer.

This is of interest to the OpenMP project because the deviceRTL runtime
library uses CUDA __shared__ variables from functions that cannot be inlined.
Trunk LLVM therefore cannot compile some OpenMP kernels for AMDGPU. In
addition to the unit tests attached, this patch, applied to ROCm LLVM with
fixed-abi enabled and the function pointer hashing scheme deleted, passes the
OpenMP suite.

This lowering will use more LDS than strictly necessary. It is intended as a
functionally correct fallback for cases that are difficult for future
optimisation passes to handle more precisely.
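
For reference, the attached tests exercise the pass directly under both pass
managers, and codegen gains a flag to bypass it (in.ll here stands in for any
input module):

    opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < in.ll
    opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < in.ll
    llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - in.ll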

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D94648

Added: 
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
    llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
    llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 79846b82eeab..cdd59fe0b847 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -71,6 +71,7 @@ FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPULowerModuleLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -145,6 +146,13 @@ struct AMDGPUPropagateAttributesLatePass
   TargetMachine &TM;
 };
 
+void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSID;
+
+struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index e19677840f53..c7c4ed45589f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -500,9 +500,10 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-
   const DataLayout &DL = F.getParent()->getDataLayout();
 
+  Info->allocateModuleLDSGlobal(F.getParent());
+
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
 
@@ -591,6 +592,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
+  Info->allocateModuleLDSGlobal(F.getParent());
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
new file mode 100644
index 000000000000..fda044ad7c89
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -0,0 +1,380 @@
+//===-- AMDGPULowerModuleLDSPass.cpp ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile time offsets from that
+// known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are passed
+// through unchanged for simplification or error diagnostics in later passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by a kernel is cheap and conservative, so this may allocate
+// a variable in every kernel when it was not strictly necessary to do so.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-module-lds"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerModuleLDS : public ModulePass {
+
+  static bool isKernelCC(Function *Func) {
+    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+  }
+
+  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                         GV->getValueType());
+  }
+
+  static bool
+  userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                       User *InitialUser) {
+    // Any LDS variable can be lowered by moving it into the created struct.
+    // Each variable so lowered is allocated in every kernel, so variables
+    // whose users are all known not to require the transform are left
+    // unchanged.
+    SmallPtrSet<User *, 8> Visited;
+    SmallVector<User *, 16> Stack;
+    Stack.push_back(InitialUser);
+
+    while (!Stack.empty()) {
+      User *V = Stack.pop_back_val();
+      Visited.insert(V);
+
+      if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
+        if (UsedList.contains(G)) {
+          continue;
+        }
+      }
+
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        if (isKernelCC(I->getFunction())) {
+          continue;
+        }
+      }
+
+      if (auto *E = dyn_cast<ConstantExpr>(V)) {
+        for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
+             ++EU) {
+          if (Visited.insert(*EU).second) {
+            Stack.push_back(*EU);
+          }
+        }
+        continue;
+      }
+
+      // Unknown user, conservatively lower the variable
+      return true;
+    }
+
+    return false;
+  }
+
+  static std::vector<GlobalVariable *>
+  findVariablesToLower(Module &M,
+                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+    std::vector<llvm::GlobalVariable *> LocalVars;
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // addrspace(3) without an initializer implies cuda/hip extern
+        // __shared__. The semantics for such a variable appear to be that all
+        // extern __shared__ variables alias one another, in which case this
+        // transform is not required.
+        continue;
+      }
+      if (!isa<UndefValue>(GV.getInitializer())) {
+        // Initializers are unimplemented for local address space.
+        // Leave such variables in place for consistent error reporting.
+        continue;
+      }
+      if (GV.isConstant()) {
+        // A constant undef variable can't be written to, and any load is
+        // undef, so it should be eliminated by the optimizer. It could be
+        // dropped by the back end if not. This pass skips over it.
+        continue;
+      }
+      if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
+            return userRequiresLowering(UsedList, U);
+          })) {
+        continue;
+      }
+      LocalVars.push_back(&GV);
+    }
+    return LocalVars;
+  }
+
+  static void removeFromUsedList(Module &M, StringRef Name,
+                                 SmallPtrSetImpl<Constant *> &ToRemove) {
+    GlobalVariable *GV = M.getGlobalVariable(Name);
+    if (!GV || ToRemove.empty()) {
+      return;
+    }
+
+    SmallVector<Constant *, 16> Init;
+    auto *CA = cast<ConstantArray>(GV->getInitializer());
+    for (auto &Op : CA->operands()) {
+      // ModuleUtils::appendToUsed only inserts Constants
+      Constant *C = cast<Constant>(Op);
+      if (!ToRemove.contains(C->stripPointerCasts())) {
+        Init.push_back(C);
+      }
+    }
+
+    if (Init.size() == CA->getNumOperands()) {
+      return; // none to remove
+    }
+
+    GV->eraseFromParent();
+
+    if (!Init.empty()) {
+      ArrayType *ATy =
+          ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size());
+      GV =
+          new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+                                   ConstantArray::get(ATy, Init), Name);
+      GV->setSection("llvm.metadata");
+    }
+  }
+
+  static void
+  removeFromUsedLists(Module &M,
+                      const std::vector<GlobalVariable *> &LocalVars) {
+    SmallPtrSet<Constant *, 32> LocalVarsSet;
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      if (Constant *C = dyn_cast<Constant>(LocalVars[I]->stripPointerCasts())) {
+        LocalVarsSet.insert(C);
+      }
+    }
+    removeFromUsedList(M, "llvm.used", LocalVarsSet);
+    removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet);
+  }
+
+  static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
+                               GlobalVariable *SGV) {
+    // The llvm.amdgcn.module.lds instance is implicitly used by all kernels
+    // that might call a function which accesses a field within it. This is
+    // presently approximated to 'all kernels' if there are any such functions
+    // in the module. This implicit use is reified as an explicit use here so
+    // that later passes, specifically PromoteAlloca, account for the required
+    // memory without any knowledge of this transform.
+
+    // An operand bundle on llvm.donothing works because the call instruction
+    // survives until after the last pass that needs to account for LDS. It is
+    // better than inline asm as the latter survives until the end of codegen. A
+    // totally robust solution would be a function with the same semantics as
+    // llvm.donothing that takes a pointer to the instance and is lowered to a
+    // no-op after LDS is allocated, but that is not presently necessary.
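+    //
+    // For illustration only, the reified use takes roughly this shape in the
+    // attached tests:
+    //   call void @llvm.donothing() [ "ExplicitUse"(
+    //     %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]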
+
+    LLVMContext &Ctx = Func->getContext();
+
+    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
+
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+
+    Function *Decl =
+        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
+        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+
+    Builder.CreateCall(FTy, Decl, {},
+                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
+                       "");
+  }
+
+  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+    SmallPtrSet<GlobalValue *, 32> UsedList;
+
+    SmallVector<GlobalValue *, 32> TmpVec;
+    collectUsedGlobalVariables(M, TmpVec, true);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    TmpVec.clear();
+    collectUsedGlobalVariables(M, TmpVec, false);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    return UsedList;
+  }
+
+public:
+  static char ID;
+
+  AMDGPULowerModuleLDS() : ModulePass(ID) {
+    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);
+
+    // Find variables to move into new struct instance
+    std::vector<GlobalVariable *> FoundLocalVars =
+        findVariablesToLower(M, UsedList);
+
+    if (FoundLocalVars.empty()) {
+      // No variables to rewrite, no changes made.
+      return false;
+    }
+
+    // Sort by alignment, descending, to minimise padding.
+    // On ties, sort by size, descending, then by name, lexicographical.
+    llvm::stable_sort(
+        FoundLocalVars,
+        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+          Align ALHS = getAlign(DL, LHS);
+          Align ARHS = getAlign(DL, RHS);
+          if (ALHS != ARHS) {
+            return ALHS > ARHS;
+          }
+
+          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+          if (SLHS != SRHS) {
+            return SLHS > SRHS;
+          }
+
+          // By variable name on tie for predictable order in test cases.
+          return LHS->getName() < RHS->getName();
+        });
+
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+    {
+      // This usually won't need to insert any padding, in which case the
+      // reserve above avoids any reallocation.
+      uint64_t CurrentOffset = 0;
+      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+        GlobalVariable *FGV = FoundLocalVars[I];
+        Align DataAlign = getAlign(DL, FGV);
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet the requested alignment.
+          // Note (o +      (a - (o % a)) ) % a == 0
+          //      (offset + Padding       ) % align == 0
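+          //
+          // Worked example from the tests: a float (size 4) followed by an
+          // i32 requesting align 8 leaves CurrentOffset == 4, so Rem == 4 and
+          // Padding == 4, producing the struct { float, [4 x i8], i32 }.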
+
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+              false));
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+
+    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
+    SGV->setAlignment(MaxAlign);
+    appendToCompilerUsed(
+        M, {static_cast<GlobalValue *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    // The verifier rejects used lists containing an inttoptr of a constant,
+    // so remove the variables from these lists before replaceAllUsesWith.
+    removeFromUsedLists(M, LocalVars);
+
+    // Replace uses of the ith variable with a constantexpr addressing the
+    // ith field of the instance that will be allocated by
+    // AMDGPUMachineFunction.
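+    // For example, a use of the variable placed at field 1 becomes
+    //   getelementptr (%llvm.amdgcn.module.lds.t,
+    //     %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1)
+    // while the gep addressing field 0 constant-folds to a plain null pointer.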
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+
+    // Mark each kernel with a call that reads the address of the allocated
+    // structure. This is not necessary for lowering. It lets other passes,
+    // specifically PromoteAlloca, accurately calculate how much LDS will be
+    // used by the kernel after lowering.
+    {
+      IRBuilder<> Builder(Ctx);
+      SmallPtrSet<Function *, 32> Kernels;
+      for (auto &I : M.functions()) {
+        Function *Func = &I;
+        if (isKernelCC(Func) && !Kernels.contains(Func)) {
+          markUsedByKernel(Builder, Func, SGV);
+          Kernels.insert(Func);
+        }
+      }
+    }
+    return true;
+  }
+};
+
+} // namespace
+char AMDGPULowerModuleLDS::ID = 0;
+
+char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                "Lower uses of LDS variables from non-kernel functions", false,
+                false)
+
+ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
+  return new AMDGPULowerModuleLDS();
+}
+
+PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
+                                                ModuleAnalysisManager &) {
+  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
+                                               : PreservedAnalyses::all();
+}

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 717145b7af53..51344976466e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -64,6 +64,18 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
   return Offset;
 }
 
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+  if (isModuleEntryFunction()) {
+    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
+    if (GV) {
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      (void)Offset;
+      assert(Offset == 0 &&
+             "Module LDS expected to be allocated before other LDS");
+    }
+  }
+}
+
 void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
                                            const GlobalVariable &GV) {
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 07cac776082d..10ff50040c6a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -94,6 +94,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+  void allocateModuleLDSGlobal(const Module *M);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d104ddc1bf99..9253775ef84b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
 char AMDGPUPromoteAlloca::ID = 0;
 char AMDGPUPromoteAllocaToVector::ID = 0;
 
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                      "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions into kernels before PromoteAlloca runs, for
+// an accurate estimate of the LDS available
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                    "AMDGPU promote alloca to vector or LDS", false, false)
 
 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                 "AMDGPU promote alloca to vector", false, false)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index fe81a3f10d6c..9db4e8c8472f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,11 @@ static cl::opt<bool> EnableStructurizerWorkarounds(
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool>
+    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
+                          cl::desc("Disable lower module lds pass"),
+                          cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -235,6 +240,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -506,6 +512,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
           PM.addPass(AMDGPUAlwaysInlinePass());
           return true;
         }
+        if (PassName == "amdgpu-lower-module-lds") {
+          PM.addPass(AMDGPULowerModuleLDSPass());
+          return true;
+        }
         return false;
       });
   PB.registerPipelineParsingCallback(
@@ -535,7 +545,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB,
           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
           return true;
         }
-
         return false;
       });
 
@@ -884,6 +893,10 @@ void AMDGPUPassConfig::addIRPasses() {
   // Replace OpenCL enqueued block function pointers with global variables.
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
+  // Can increase the LDS used by a kernel, so runs before PromoteAlloca
+  if (!DisableLowerModuleLDS)
+    addPass(createAMDGPULowerModuleLDSPass());
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());

diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 71f2026c3326..7aa256821167 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -67,6 +67,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
+  AMDGPULowerModuleLDSPass.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d48b23b94b60..acdfb7eeea7d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2263,6 +2263,8 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
+  Info->allocateModuleLDSGlobal(Fn.getParent());
+
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
index 291cf966c701..4473c86a6e60 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
index 223efcc73818..7b2e03e1aae8 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
 

diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
index dc6c2684c5b1..823c6a001ff9 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
new file mode 100644
index 000000000000..3ea440dff872
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }
+
+@func = addrspace(3) global float undef, align 4
+
+; @kern is only used from a kernel so it is left unchanged
+; CHECK: @kern = addrspace(3) global float undef, align 4
+@kern = addrspace(3) global float undef, align 4
+
+; @func is only used from a non-kernel function so is rewritten
+; CHECK-NOT: @func
+; @both is used from a non-kernel function so is rewritten
+; CHECK-NOT: @both
+; sorted both < func, so @both at null and @func at 4
+@both = addrspace(3) global float undef, align 4
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+
+; CHECK-LABEL: @get_func()
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+define i32 @get_func() local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: @set_func(i32 %x)
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define void @set_func(i32 %x) local_unnamed_addr #1 {
+entry:
+  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @timestwo()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %mul = mul i32 %ld, 2
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define amdgpu_kernel void @timestwo() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  %mul = mul i32 %ld, 2
+  store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
new file mode 100644
index 000000000000..f241ab69c32a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Variables that are not lowered by this pass are left unchanged
+; CHECK-NOT: asm
+; CHECK-NOT: llvm.amdgcn.module.lds
+; CHECK-NOT: llvm.amdgcn.module.lds.t
+
+; var1, var2 would be transformed were they used from a non-kernel function
+; CHECK: @var1 = addrspace(3) global i32 undef
+; CHECK: @var2 = addrspace(3) global float undef
+@var1 = addrspace(3) global i32 undef
+@var2 = addrspace(3) global float undef
+
+; constant variables are left to the optimizer / error diagnostics
+; CHECK: @const_undef = addrspace(3) constant i32 undef
+; CHECK: @const_with_init = addrspace(3) constant i64 8
+@const_undef = addrspace(3) constant i32 undef
+@const_with_init = addrspace(3) constant i64 8
+
+; External and constant are both left to the optimizer / error diagnostics
+; CHECK: @extern = external addrspace(3) global i32
+@extern = external addrspace(3) global i32
+
+; Use of an addrspace(3) variable with an initializer is skipped,
+; so as to preserve the unimplemented error from llc
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Only local addrspace variables are transformed
+; CHECK: @addr4 = addrspace(4) global i64 undef
+@addr4 = addrspace(4) global i64 undef
+
+; Assign to self is treated as any other initializer, i.e. ignored by this pass
+; CHECK: @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+
+; Use by .used lists doesn't trigger lowering
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; Access from a function would cause lowering for non-excluded cases
+; CHECK-LABEL: @use_variables()
+; CHECK: %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+; CHECK: %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+; CHECK: %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+; CHECK: %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+; CHECK: %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+define void @use_variables() {
+  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+  %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+  %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  ret void
+}
+
+; Use by kernel doesn't trigger lowering
+; CHECK-LABEL: @kern_use()
+; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+define amdgpu_kernel void @kern_use() {
+  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  call void @use_variables()
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
new file mode 100644
index 000000000000..2da2bc04c8fe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
+
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
+
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+@function_target = addrspace(3) global float undef, align 4
+@function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
+
+@kernel_target = addrspace(3) global double undef, align 8
+@kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
+
+; CHECK-LABEL: @function(float %x)
+; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+define void @function(float %x) local_unnamed_addr #5 {
+entry:
+  %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+  store float %x, float* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: @kernel(double %x)
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
+entry:
+  %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+  store double %x, double* %0, align 8
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
new file mode 100644
index 000000000000..ce45cf1849ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Check that the new struct is added to llvm.compiler.used and that the
+; replaced variable is removed
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float }
+; CHECK: @ignored = addrspace(1) global i64 0
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; CHECK-NOT: @tolower
+
+@tolower = addrspace(3) global float undef, align 8
+
+; A variable that is unchanged by the pass
+@ignored = addrspace(1) global i64 0
+
+
+; Start with one value to replace and one to ignore in each .used list
+
+; @ignored still in list, @tolower removed
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
new file mode 100644
index 000000000000..3c7313802bda
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Padding is inserted to meet alignment, so references to @var1 are replaced
+; with gep ptr, 0, 2. There is no i64 field, as addrspace(3) variables with
+; initializers are ignored; likewise no addrspace(4) field.
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+
+; Variables removed by pass
+; CHECK-NOT: @var0
+; CHECK-NOT: @var1
+
+@var0 = addrspace(3) global float undef, align 8
+@var1 = addrspace(3) global i32 undef, align 8
+
+@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4
+
+; A variable that is unchanged by the pass
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Instance of new type, aligned to max of element alignment
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; Uses in func are rewritten to access the struct at address zero, which
+; prints as null
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %val1 = add i32 %val0, 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
+  %val0 = load i32, i32 addrspace(3)* @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, i32 addrspace(3)* @var1, align 4
+  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  ret void
+}
+
+; This kernel calls a function that uses LDS so needs the block
+; CHECK-LABEL: @kern_call()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: call void @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
+  ret void
+}
+
+; This kernel makes no calls, so it does not strictly need the LDS block, but
+; the pass conservatively marks every kernel in the module
+; CHECK-LABEL: @kern_empty()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+define spir_kernel void @kern_empty() {
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
index 4e23875502e9..0ee1cc5886de 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"
 




More information about the llvm-commits mailing list