[llvm] 0507448 - [amdgpu] Implement dynamic LDS accesses from non-kernel functions

Jon Chesterfield via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 4 12:06:50 PDT 2023


Author: Jon Chesterfield
Date: 2023-04-04T20:06:34+01:00
New Revision: 0507448d829818e29f7d8df6652002c8cc5683d1

URL: https://github.com/llvm/llvm-project/commit/0507448d829818e29f7d8df6652002c8cc5683d1
DIFF: https://github.com/llvm/llvm-project/commit/0507448d829818e29f7d8df6652002c8cc5683d1.diff

LOG: [amdgpu] Implement dynamic LDS accesses from non-kernel functions

The premise here is to allow non-kernel functions to locate external LDS variables without using LDS or extra magic SGPRs to do so.

1/ First it crawls the callgraph to work out which external LDS variables are reachable from a given kernel.
2/ Then it creates a new `extern char[0]` variable for each such kernel, which will alias all the other extern LDS variables because that is the documented behaviour of these variables.
3/ The address of that variable is written to a lookup table, and the global variable is tagged with metadata recording the address codegen allocates it at.
4/ The assembler builds the lookup table using that metadata.
5/ Non-kernel functions then use the same magic intrinsic as table lookups of non-dynamic LDS variables to find the address to use.

There is heavy overlap with the code paths taken for the other lowering strategies; in particular, the same intrinsic is used to pass the dynamic scope information through the same SGPR as for table lookups of static LDS. A rough IR-level sketch of the result is shown below.
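
For illustration, here is a minimal hand-written sketch of the IR this lowering aims to produce for a single kernel. The names @use_dynamic_lds, @dynamic_shared and @llvm.amdgcn.kern.dynlds are hypothetical stand-ins; the real output (see the new test added below) additionally carries !absolute_symbol metadata on the variables and !llvm.amdgcn.lds.kernel.id metadata on each kernel.

  ; The original dynamic ("extern __shared__") variable; its uses from
  ; non-kernel functions are rewritten to the table lookup below.
  @dynamic_shared = external addrspace(3) global [0 x i32], align 4

  ; One extern char[0] variable per kernel that reaches dynamic LDS. It aliases
  ; every dynamic LDS variable and is allocated after the kernel's static LDS.
  @llvm.amdgcn.kern.dynlds = external addrspace(3) global [0 x i8], align 4

  ; Constant table, indexed by kernel id, holding the address of that variable.
  @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [1 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.kern.dynlds to i32)]

  ; Non-kernel functions recover the address through the kernel-id SGPR.
  define void @use_dynamic_lds() {
    %id = call i32 @llvm.amdgcn.lds.kernel.id()
    %slot = getelementptr inbounds [1 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 %id
    %addr = load i32, ptr addrspace(4) %slot, align 4
    %dyn = inttoptr i32 %addr to ptr addrspace(3)
    store i32 0, ptr addrspace(3) %dyn, align 4
    ret void
  }

  declare i32 @llvm.amdgcn.lds.kernel.id()

The integer stored in the table is a compile-time constant because the pass records the absolute address of the per-kernel dynamic variable (after the module and kernel structs, plus alignment padding) while it still has the whole module in view.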

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D144233

Added: 
    llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
    llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
    llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
    llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 2e24e9f929d2a..b53def912ab61 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -127,7 +127,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
     unsigned AS = GV.getAddressSpace();
     if ((AS == AMDGPUAS::REGION_ADDRESS) ||
         (AS == AMDGPUAS::LOCAL_ADDRESS &&
-         (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer())))
+         (!AMDGPUTargetMachine::EnableLowerModuleLDS)))
       recursivelyVisitUsers(GV, FuncsToAlwaysInline);
   }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ef234b0014023..209c90d2cb620 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2542,7 +2542,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
       // allocated ones. They all share the same offset.
       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
         // Adjust alignment for that dynamic shared memory array.
-        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+        MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
         LLT S32 = LLT::scalar(32);
         auto Sz =
             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index c9f8d91407173..e020e50ca26de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -470,7 +470,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
 
   void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
                                  GlobalVariable *LookupTable,
-                                 GlobalVariable *GV, Use &U, Value *Index) {
+                                 GlobalVariable *GV, Use &U,
+                                 Value *OptionalIndex) {
     // Table is a constant array of the same length as OrderedKernels
     LLVMContext &Ctx = M.getContext();
     Type *I32 = Type::getInt32Ty(Ctx);
@@ -485,11 +486,12 @@ class AMDGPULowerModuleLDS : public ModulePass {
       Builder.SetInsertPoint(I);
     }
 
-    Value *GEPIdx[3] = {
+    SmallVector<Value *, 3> GEPIdx = {
         ConstantInt::get(I32, 0),
         tableKernelIndex,
-        Index,
     };
+    if (OptionalIndex)
+      GEPIdx.push_back(OptionalIndex);
 
     Value *Address = Builder.CreateInBoundsGEP(
         LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
@@ -641,21 +643,25 @@ class AMDGPULowerModuleLDS : public ModulePass {
   }
 
   static std::vector<Function *> assignLDSKernelIDToEachKernel(
-      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS) {
+      Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
     // Associate kernels in the set with an arbirary but reproducible order and
     // annotate them with that order in metadata. This metadata is recognised by
     // the backend and lowered to a SGPR which can be read from using
     // amdgcn_lds_kernel_id.
 
     std::vector<Function *> OrderedKernels;
-    {
+    if (!KernelsThatAllocateTableLDS.empty() ||
+        !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+
       for (Function &Func : M->functions()) {
         if (Func.isDeclaration())
           continue;
         if (!isKernelLDS(&Func))
           continue;
 
-        if (KernelsThatAllocateTableLDS.contains(&Func)) {
+        if (KernelsThatAllocateTableLDS.contains(&Func) ||
+            KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
           assert(Func.hasName()); // else fatal error earlier
           OrderedKernels.push_back(&Func);
         }
@@ -692,7 +698,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
       VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
       DenseSet<GlobalVariable *> &ModuleScopeVariables,
       DenseSet<GlobalVariable *> &TableLookupVariables,
-      DenseSet<GlobalVariable *> &KernelAccessVariables) {
+      DenseSet<GlobalVariable *> &KernelAccessVariables,
+      DenseSet<GlobalVariable *> &DynamicVariables) {
 
     GlobalVariable *HybridModuleRoot =
         LoweringKindLoc != LoweringKind::hybrid
@@ -714,6 +721,11 @@ class AMDGPULowerModuleLDS : public ModulePass {
       assert(AMDGPU::isLDSVariableToLower(*GV));
       assert(K.second.size() != 0);
 
+      if (AMDGPU::isDynamicLDS(*GV)) {
+        DynamicVariables.insert(GV);
+        continue;
+      }
+
       switch (LoweringKindLoc) {
       case LoweringKind::module:
         ModuleScopeVariables.insert(GV);
@@ -752,7 +764,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
     // All LDS variables accessed indirectly have now been partitioned into
     // the distinct lowering strategies.
     assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
-               KernelAccessVariables.size() ==
+               KernelAccessVariables.size() + DynamicVariables.size() ==
            LDSToKernelsThatNeedToAccessItIndirectly.size());
   }
 
@@ -822,7 +834,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
         markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
 
       } else {
-        Func.addFnAttr("amdgpu-elide-module-lds");
+        markElideModuleLDS(Func);
       }
     }
 
@@ -847,13 +859,17 @@ class AMDGPULowerModuleLDS : public ModulePass {
       // Allocating variables that are used directly in this struct to get
       // alignment aware allocation and predictable frame size.
       for (auto &v : LDSUsesInfo.direct_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!AMDGPU::isDynamicLDS(*v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
 
       // Allocating variables that are accessed indirectly so that a lookup of
       // this struct instance can find them from nested functions.
       for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
-        KernelUsedVariables.insert(v);
+        if (!AMDGPU::isDynamicLDS(*v)) {
+          KernelUsedVariables.insert(v);
+        }
       }
 
       // Variables allocated in module lds must all resolve to that struct,
@@ -866,6 +882,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
 
       if (KernelUsedVariables.empty()) {
         // Either used no LDS, or the LDS it used was all in the module struct
+        // or dynamically sized
         continue;
       }
 
@@ -885,33 +902,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
       auto Replacement =
           createLDSVariableReplacement(M, VarName, KernelUsedVariables);
 
-      // This struct is allocated at a predictable address that can be
-      // calculated now, recorded in metadata then used to lower references to
-      // it during codegen.
-      {
-        // frame layout, starting from 0
-        //{
-        //  module.lds
-        //  alignment padding
-        //  kernel instance
-        //}
-
-        if (!MaybeModuleScopeStruct ||
-            Func.hasFnAttribute("amdgpu-elide-module-lds")) {
-          // There's no module.lds for this kernel so this replacement struct
-          // goes first
-          recordLDSAbsoluteAddress(&M, Replacement.SGV, 0);
-        } else {
-          const DataLayout &DL = M.getDataLayout();
-          TypeSize ModuleSize =
-              DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
-          GlobalVariable *KernelStruct = Replacement.SGV;
-          Align KernelAlign = AMDGPU::getAlign(DL, KernelStruct);
-          recordLDSAbsoluteAddress(&M, Replacement.SGV,
-                                   alignTo(ModuleSize, KernelAlign));
-        }
-      }
-
       // remove preserves existing codegen
       removeLocalVarsFromUsedLists(M, KernelUsedVariables);
       KernelToReplacement[&Func] = Replacement;
@@ -926,6 +916,121 @@ class AMDGPULowerModuleLDS : public ModulePass {
     return KernelToReplacement;
   }
 
+  static GlobalVariable *
+  buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
+                                        Function *func) {
+    // Create a dynamic lds variable with a name associated with the passed
+    // function that has the maximum alignment of any dynamic lds variable
+    // reachable from this kernel. Dynamic LDS is allocated after the static LDS
+    // allocation, possibly after alignment padding. The representative variable
+    // created here has the maximum alignment of any other dynamic variable
+    // reachable by that kernel. All dynamic LDS variables are allocated at the
+    // same address in each kernel in order to provide the documented aliasing
+    // semantics. Setting the alignment here allows this IR pass to accurately
+    // predict the exact constant at which it will be allocated.
+
+    assert(isKernelLDS(func));
+
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    Align MaxDynamicAlignment(1);
+
+    auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
+      if (AMDGPU::isDynamicLDS(*GV)) {
+        MaxDynamicAlignment =
+            std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+      }
+    };
+
+    for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
+      UpdateMaxAlignment(GV);
+    }
+
+    assert(func->hasName()); // Checked by caller
+    auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+    GlobalVariable *N = new GlobalVariable(
+        M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+        Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+        false);
+    N->setAlignment(MaxDynamicAlignment);
+
+    assert(AMDGPU::isDynamicLDS(*N));
+    return N;
+  }
+
+  DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
+      Module &M, LDSUsesInfoTy &LDSUsesInfo,
+      DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
+      DenseSet<GlobalVariable *> const &DynamicVariables,
+      std::vector<Function *> const &OrderedKernels) {
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
+    if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+      LLVMContext &Ctx = M.getContext();
+      IRBuilder<> Builder(Ctx);
+      Type *I32 = Type::getInt32Ty(Ctx);
+
+      std::vector<Constant *> newDynamicLDS;
+
+      // Table is built in the same order as OrderedKernels
+      for (auto &func : OrderedKernels) {
+
+        if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
+          assert(isKernelLDS(func));
+          if (!func->hasName()) {
+            report_fatal_error("Anonymous kernels cannot use LDS variables");
+          }
+
+          GlobalVariable *N =
+              buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
+
+          KernelToCreatedDynamicLDS[func] = N;
+
+          markUsedByKernel(Builder, func, N);
+
+          auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+          auto GEP = ConstantExpr::getGetElementPtr(
+              emptyCharArray, N, ConstantInt::get(I32, 0), true);
+          newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+        } else {
+          newDynamicLDS.push_back(PoisonValue::get(I32));
+        }
+      }
+      assert(OrderedKernels.size() == newDynamicLDS.size());
+
+      ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
+      Constant *init = ConstantArray::get(t, newDynamicLDS);
+      GlobalVariable *table = new GlobalVariable(
+          M, t, true, GlobalValue::InternalLinkage, init,
+          "llvm.amdgcn.dynlds.offset.table", nullptr,
+          GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+      for (GlobalVariable *GV : DynamicVariables) {
+        for (Use &U : make_early_inc_range(GV->uses())) {
+          auto *I = dyn_cast<Instruction>(U.getUser());
+          if (!I)
+            continue;
+          if (isKernelLDS(I->getFunction()))
+            continue;
+
+          replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
+        }
+      }
+    }
+    return KernelToCreatedDynamicLDS;
+  }
+
+  static bool canElideModuleLDS(const Function &F) {
+    return F.hasFnAttribute("amdgpu-elide-module-lds");
+  }
+
+  static void markElideModuleLDS(Function &F) {
+    F.addFnAttr("amdgpu-elide-module-lds");
+  }
+
   bool runOnModule(Module &M) override {
     CallGraph CG = CallGraph(M);
     bool Changed = superAlignLDSGlobals(M);
@@ -948,12 +1053,15 @@ class AMDGPULowerModuleLDS : public ModulePass {
       }
     }
 
+    // Partition variables accessed indirectly into the different strategies
     DenseSet<GlobalVariable *> ModuleScopeVariables;
     DenseSet<GlobalVariable *> TableLookupVariables;
     DenseSet<GlobalVariable *> KernelAccessVariables;
+    DenseSet<GlobalVariable *> DynamicVariables;
     partitionVariablesIntoIndirectStrategies(
         M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
-        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables);
+        ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
+        DynamicVariables);
 
     // If the kernel accesses a variable that is going to be stored in the
     // module instance through a call then that kernel needs to allocate the
@@ -965,6 +1073,10 @@ class AMDGPULowerModuleLDS : public ModulePass {
         kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
                                                         TableLookupVariables);
 
+    DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
+        kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+                                                        DynamicVariables);
+
     GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
         M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
 
@@ -990,14 +1102,15 @@ class AMDGPULowerModuleLDS : public ModulePass {
       });
     }
 
+    // The ith element of this vector is kernel id i
+    std::vector<Function *> OrderedKernels =
+        assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
+                                      KernelsThatIndirectlyAllocateDynamicLDS);
+
     if (!KernelsThatAllocateTableLDS.empty()) {
       LLVMContext &Ctx = M.getContext();
       IRBuilder<> Builder(Ctx);
 
-      // The ith element of this vector is kernel id i
-      std::vector<Function *> OrderedKernels =
-          assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS);
-
       for (size_t i = 0; i < OrderedKernels.size(); i++) {
         markUsedByKernel(Builder, OrderedKernels[i],
                          KernelToReplacement[OrderedKernels[i]].SGV);
@@ -1019,6 +1132,70 @@ class AMDGPULowerModuleLDS : public ModulePass {
                                                LookupTable);
     }
 
+    DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
+        lowerDynamicLDSVariables(M, LDSUsesInfo,
+                                 KernelsThatIndirectlyAllocateDynamicLDS,
+                                 DynamicVariables, OrderedKernels);
+
+    // All kernel frames have been allocated. Calculate and record the
+    // addresses.
+
+    {
+      const DataLayout &DL = M.getDataLayout();
+
+      for (Function &Func : M.functions()) {
+        if (Func.isDeclaration() || !isKernelLDS(&Func))
+          continue;
+
+        // All three of these are optional. The first variable is allocated at
+        // zero. They are allocated by allocateKnownAddressLDSGlobal in the
+        // following order:
+        //{
+        //  module.lds
+        //  alignment padding
+        //  kernel instance
+        //  alignment padding
+        //  dynamic lds variables
+        //}
+
+        const bool AllocateModuleScopeStruct =
+            MaybeModuleScopeStruct && !canElideModuleLDS(Func);
+
+        const bool AllocateKernelScopeStruct =
+            KernelToReplacement.contains(&Func);
+
+        const bool AllocateDynamicVariable =
+            KernelToCreatedDynamicLDS.contains(&Func);
+
+        uint32_t Offset = 0;
+
+        if (AllocateModuleScopeStruct) {
+          // Allocated at zero, recorded once on construction, not once per
+          // kernel
+          Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
+        }
+
+        if (AllocateKernelScopeStruct) {
+          GlobalVariable *KernelStruct = KernelToReplacement[&Func].SGV;
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
+
+          recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
+
+          Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+
+        }
+
+        if (AllocateDynamicVariable) {
+          GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
+
+          Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
+
+          recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
+        }
+      }
+    }
+
     for (auto &GV : make_early_inc_range(M.globals()))
       if (AMDGPU::isLDSVariableToLower(GV)) {
         // probably want to remove from used lists

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 6c6cc0127a2b9..5a12782de1a57 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -99,6 +99,15 @@ static const GlobalVariable *getKernelLDSGlobalFromFunction(const Function &F) {
   return M->getNamedGlobal(KernelLDSName);
 }
 
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
+  const Module *M = F.getParent();
+  std::string KernelDynLDSName = "llvm.amdgcn.";
+  KernelDynLDSName += F.getName();
+  KernelDynLDSName += ".dynlds";
+  return M->getNamedGlobal(KernelDynLDSName);
+}
+
 // This kernel calls no functions that require the module lds struct
 static bool canElideModuleLDS(const Function &F) {
   return F.hasFnAttribute("amdgpu-elide-module-lds");
@@ -131,6 +140,7 @@ void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
 
     const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
     const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+    const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
 
     if (GV && !canElideModuleLDS(F)) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
@@ -149,6 +159,19 @@ void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
         report_fatal_error("Inconsistent metadata on kernel LDS variable");
       }
     }
+
+    if (Dyn) {
+      // The dynamic LDS is deterministic because the per-kernel one has the
+      // maximum alignment of any reachable and all remaining LDS variables,
+      // if this is present, are themselves dynamic LDS and will be allocated
+      // at the same address.
+      setDynLDSAlign(F, *Dyn);
+      unsigned Offset = LDSSize;
+      std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+      if (!Expect || (Offset != *Expect)) {
+        report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+      }
+    }
   }
 }
 
@@ -187,8 +210,10 @@ AMDGPUMachineFunction::getLDSAbsoluteAddress(const GlobalValue &GV) {
   return {};
 }
 
-void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
                                            const GlobalVariable &GV) {
+  const Module *M = F.getParent();
+  const DataLayout &DL = M->getDataLayout();
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
   Align Alignment =
@@ -198,4 +223,17 @@ void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
 
   LDSSize = alignTo(StaticLDSSize, Alignment);
   DynLDSAlign = Alignment;
+
+  // If there is a dynamic LDS variable associated with this function F, every
+  // further dynamic LDS instance (allocated by calling setDynLDSAlign) must
+  // map to the same address. This holds because no LDS is allocated after the
+  // lowering pass if there are dynamic LDS variables present.
+  const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
+  if (Dyn) {
+    unsigned Offset = LDSSize; // return this?
+    std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+    if (!Expect || (Offset != *Expect)) {
+      report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+    }
+  }
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index ba4c55a58c866..cdb937fe7b24a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -111,7 +111,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
-  void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
+  void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
 };
 
 }

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ddadbae881194..23fb02c8f248f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6139,7 +6139,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
       if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
         assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
         // Adjust alignment for that dynamic shared memory array.
-        MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+        Function &F = DAG.getMachineFunction().getFunction();
+        MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
         return SDValue(
             DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
       }

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index b1418253fd136..bcf9d1e1a5c12 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -75,21 +75,25 @@ static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
   return Ret;
 }
 
-bool isLDSVariableToLower(const GlobalVariable &GV) {
+bool isDynamicLDS(const GlobalVariable &GV) {
+  // external zero size addrspace(3) without initializer implies cuda/hip extern
+  // __shared__ the semantics for such a variable appears to be that all extern
+  // __shared__ variables alias one another. This hits different handling.
+  const Module *M = GV.getParent();
+  const DataLayout &DL = M->getDataLayout();
   if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
-  if (!GV.hasInitializer()) {
-    // addrspace(3) without initializer implies cuda/hip extern __shared__
-    // the semantics for such a variable appears to be that all extern
-    // __shared__ variables alias one another, in which case this transform
-    // is not required
+  uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+  return GV.hasExternalLinkage() && AllocSize == 0;
+}
+
+bool isLDSVariableToLower(const GlobalVariable &GV) {
+  if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
     return false;
   }
-  if (!isa<UndefValue>(GV.getInitializer())) {
-    // Initializers are unimplemented for LDS address space.
-    // Leave such variables in place for consistent error reporting.
-    return false;
+  if (isDynamicLDS(GV)) {
+    return true;
   }
   if (GV.isConstant()) {
     // A constant undef variable can't be written to, and any load is
@@ -97,6 +101,11 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
     // dropped by the back end if not. This pass skips over it.
     return false;
   }
+  if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
+    // Initializers are unimplemented for LDS address space.
+    // Leave such variables in place for consistent error reporting.
+    return false;
+  }
   return true;
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 92373fc14a986..a5ba5bd773455 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -28,6 +28,7 @@ namespace AMDGPU {
 
 Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
 
+bool isDynamicLDS(const GlobalVariable &GV);
 bool isLDSVariableToLower(const GlobalVariable &GV);
 std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
                                                       const Function *F);

diff  --git a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
index 4cce4bdb446fc..492d3ba7b0fcb 100644
--- a/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
+++ b/llvm/test/CodeGen/AMDGPU/hip.extern.shared.array.ll
@@ -135,25 +135,4 @@ define amdgpu_kernel void @dynamic_shared_array_6(i32 %idx) {
   ret void
 }
 
-; CHECK-LABEL: dynamic_shared_array_with_call:
-; CHECK-NOT: s_swappc_b64
-define amdgpu_kernel void @dynamic_shared_array_with_call(ptr addrspace(1) nocapture readnone %out) local_unnamed_addr {
-  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %1 = sext i32 %tid.x to i64
-  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds0, i64 0, i64 %1
-  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
-  tail call void @store_value(float %val0)
-  ret void
-}
-
-; CHECK-NOT: store_value
-define linkonce_odr hidden void @store_value(float %val1) local_unnamed_addr {
-entry:
-  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %0 = sext i32 %tid.x to i64
-  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @dynamic_shared0, i64 0, i64 %0
-  store float %val1, ptr addrspace(3) %arrayidx1, align 4
-  ret void
-}
-
 declare i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index d8e3f52331ab4..5411e18018c35 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -34,7 +34,64 @@ define void @use_module() #0 {
 @extern_normal = external addrspace(3) global [0 x float]
 @extern_overalign = external addrspace(3) global [0 x float], align 8
 
-; 2^3 cases encoded into function names
+
+; External LDS does not influence the frame when called indirectly either
+define void @use_extern_normal() #0 {
+; CHECK-LABEL: use_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s4, s15
+; CHECK-NEXT:    s_ashr_i32 s5, s15, 31
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x4048f5c3
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; CHECK-NEXT:    s_add_u32 s4, s4, s6
+; CHECK-NEXT:    s_addc_u32 s5, s5, s7
+; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    ds_write_b32 v1, v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 0
+  store float 0x40091EB860000000, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+define void @use_extern_overalign() #0 {
+; CHECK-LABEL: use_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, llvm.amdgcn.dynlds.offset.table@rel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, llvm.amdgcn.dynlds.offset.table@rel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s4, s15
+; CHECK-NEXT:    s_ashr_i32 s5, s15, 31
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; CHECK-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
+; CHECK-NEXT:    s_add_u32 s4, s4, s6
+; CHECK-NEXT:    s_addc_u32 s5, s5, s7
+; CHECK-NEXT:    s_load_dword s4, s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    ds_write_b32 v1, v0 offset:4
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %arrayidx = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 1
+  store float 4.200000e+01, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+
+; First 2^3 of 2^4 cases encoded into function names
+; no use of extern variable from nested function
+; module_variable used/not-used
+; kernel variable normal/overaligned
+; extern variable normal/overaligned
 
 define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
 ; CHECK-LABEL: module_0_kernel_normal_extern_normal:
@@ -268,5 +325,281 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
   ret void
 }
 
+
+;; Second 2^3 of 2^4 cases encoded into function names
+; with extern variable from nested function
+; module_variable used/not-used
+; kernel variable normal/overaligned
+; extern variable normal/overaligned
+
+define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_mov_b32 s15, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CHECK-NEXT:    s_endpgm
+  store i16 2, ptr addrspace(3) @kernel_normal
+
+  call void @use_extern_normal()
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    s_mov_b32 s15, 4
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, ptr addrspace(3) @module_variable
+
+  store i16 2, ptr addrspace(3) @kernel_normal
+
+  call void @use_extern_normal()
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_normal@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_normal@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_mov_b32 s15, 2
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CHECK-NEXT:    s_endpgm
+  store i16 2, ptr addrspace(3) @kernel_overalign
+
+  call void @use_extern_normal()
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_extern_normal@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_extern_normal@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    s_mov_b32 s15, 6
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, ptr addrspace(3) @module_variable
+
+  store i16 2, ptr addrspace(3) @kernel_overalign
+
+  call void @use_extern_normal()
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_mov_b32 s15, 1
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CHECK-NEXT:    s_endpgm
+  store i16 2, ptr addrspace(3) @kernel_normal
+
+  call void @use_extern_overalign()
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    s_mov_b32 s15, 5
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, ptr addrspace(3) @module_variable
+
+  store i16 2, ptr addrspace(3) @kernel_normal
+
+  call void @use_extern_overalign()
+  ret void
+}
+
+define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) #1 {
+; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_extern_overalign@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_extern_overalign@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 2
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_mov_b32 s15, 3
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CHECK-NEXT:    s_endpgm
+  store i16 2, ptr addrspace(3) @kernel_overalign
+
+  call void @use_extern_overalign()
+  ret void
+}
+
+define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
+; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_add_u32 s8, s8, s11
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s11
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_extern_overalign@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_extern_overalign@gotpcrel32@hi+12
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 2
+; CHECK-NEXT:    s_mov_b32 s15, 7
+; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CHECK-NEXT:    s_endpgm
+  call void @use_module()
+  store i16 1, ptr addrspace(3) @module_variable
+
+  store i16 2, ptr addrspace(3) @kernel_overalign
+
+  call void @use_extern_overalign()
+  ret void
+}
+
+
 attributes #0 = { noinline }
 attributes #1 = { "amdgpu-elide-module-lds" }

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
index d55d39107c4d8..b5876204b3ffa 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -18,10 +18,6 @@
 @const_undef = addrspace(3) constant i32 undef
 @const_with_init = addrspace(3) constant i64 8
 
-; External and constant are both left to the optimizer / error diagnostics
-; CHECK: @extern = external addrspace(3) global i32
-@extern = external addrspace(3) global i32
-
 ; Use of an addrspace(3) variable with an initializer is skipped,
 ; so as to preserve the unimplemented error from llc
 ; CHECK: @with_init = addrspace(3) global i64 0
@@ -47,14 +43,12 @@
 ; CHECK: %c0 = load i32, ptr addrspace(3) @const_undef, align 4
 ; CHECK: %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
 ; CHECK: %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-; CHECK: %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-; CHECK: %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+; CHECK: %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
 define void @use_variables() {
   %c0 = load i32, ptr addrspace(3) @const_undef, align 4
   %c1 = load i64, ptr addrspace(3) @const_with_init, align 4
   %v0 = atomicrmw add ptr addrspace(3) @with_init, i64 1 seq_cst
-  %v1 = cmpxchg ptr addrspace(3) @extern, i32 4, i32 %c0 acq_rel monotonic
-  %v2 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
+  %v1 = atomicrmw add ptr addrspace(4) @addr4, i64 %c1 monotonic
   ret void
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
new file mode 100644
index 0000000000000..d4f50e57970b9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect-extern-uses-max-reachable-alignment.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+
+; Not reached by a non-kernel function and therefore not changed by this pass
+@dynamic_kernel_only = external addrspace(3) global [0 x double]
+
+; shared8 is used directly by a kernel so remains in the output
+; the other three are only used by functions and will be replaced with
+; table lookups and dropped from the IR
+@dynamic_shared1 = external addrspace(3) global [0 x i8], align 1
+@dynamic_shared2 = external addrspace(3) global [0 x i16], align 2
+@dynamic_shared4 = external addrspace(3) global [0 x i32], align 4
+@dynamic_shared8 = external addrspace(3) global [0 x i64], align 8
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { i32 }
+; CHECK: @dynamic_kernel_only = external addrspace(3) global [0 x double]
+; CHECK: @dynamic_shared8 = external addrspace(3) global [0 x i64], align 8
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4, !absolute_symbol !0
+; CHECK: @llvm.compiler.used = appending global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+
+; Alignment of these must be the maximum of the alignment of the reachable symbols
+; CHECK: @llvm.amdgcn.expect_align1.dynlds = external addrspace(3) global [0 x i8], align 1, !absolute_symbol !0
+; CHECK: @llvm.amdgcn.expect_align2.dynlds = external addrspace(3) global [0 x i8], align 2, !absolute_symbol !0
+; CHECK: @llvm.amdgcn.expect_align4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1
+; CHECK: @llvm.amdgcn.expect_align8.dynlds = external addrspace(3) global [0 x i8], align 8, !absolute_symbol !0
+
+; Align 4 and symbol at address [4 5) as module.lds is reachable
+; CHECK: @llvm.amdgcn.expect_max_of_2_and_4.dynlds = external addrspace(3) global [0 x i8], align 4, !absolute_symbol !1
+
+; Builds a lookup table out of the newly created (suffixed .dynlds) variables in kernel.id order
+; CHECK: @llvm.amdgcn.dynlds.offset.table = internal addrspace(4) constant [5 x i32] [i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds to i32), i32 ptrtoint (ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds to i32)]
+
+
+
+define amdgpu_kernel void @kernel_only() {
+; CHECK-LABEL: @kernel_only() #0 {
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0
+; CHECK-NEXT:    store double 3.140000e+00, ptr addrspace(3) [[ARRAYIDX]], align 8
+; CHECK-NEXT:    ret void
+;
+  %arrayidx = getelementptr inbounds [0 x double], ptr addrspace(3) @dynamic_kernel_only, i32 0, i32 0
+  store double 3.140000e+00, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+; The accesses from functions are rewritten to go through the llvm.amdgcn.dynlds.offset.table
+define void @use_shared1() #0 {
+; CHECK-LABEL: @use_shared1() #1 {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[DYNAMIC_SHARED1:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED1]], align 4
+; CHECK-NEXT:    [[DYNAMIC_SHARED11:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr addrspace(3) [[DYNAMIC_SHARED11]], i32 0, i32 1
+; CHECK-NEXT:    store i8 0, ptr addrspace(3) [[ARRAYIDX]], align 1
+; CHECK-NEXT:    ret void
+;
+  %arrayidx = getelementptr inbounds [0 x i8], ptr addrspace(3) @dynamic_shared1, i32 0, i32 1
+  store i8 0, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+define void @use_shared2() #0 {
+; CHECK-LABEL: @use_shared2() #1 {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[DYNAMIC_SHARED2:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED2]], align 4
+; CHECK-NEXT:    [[DYNAMIC_SHARED21:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i16], ptr addrspace(3) [[DYNAMIC_SHARED21]], i32 0, i32 3
+; CHECK-NEXT:    store i16 1, ptr addrspace(3) [[ARRAYIDX]], align 2
+; CHECK-NEXT:    ret void
+;
+  %arrayidx = getelementptr inbounds [0 x i16], ptr addrspace(3) @dynamic_shared2, i32 0, i32 3
+  store i16 1, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+; Include a normal variable so that the new variables aren't all at the same absolute_symbol
+@static_shared = addrspace(3) global i32 undef
+define void @use_shared4() #0 {
+; CHECK-LABEL: @use_shared4() #1 {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    store i32 4, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4
+; CHECK-NEXT:    [[DYNAMIC_SHARED4:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED4]], align 4
+; CHECK-NEXT:    [[DYNAMIC_SHARED41:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr addrspace(3) [[DYNAMIC_SHARED41]], i32 0, i32 5
+; CHECK-NEXT:    store i32 2, ptr addrspace(3) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 4, ptr addrspace(3) @static_shared
+  %arrayidx = getelementptr inbounds [0 x i32], ptr addrspace(3) @dynamic_shared4, i32 0, i32 5
+  store i32 2, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+define void @use_shared8() #0 {
+; CHECK-LABEL: @use_shared8() #1 {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT:    [[DYNAMIC_SHARED8:%.*]] = getelementptr inbounds [5 x i32], ptr addrspace(4) @llvm.amdgcn.dynlds.offset.table, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[DYNAMIC_SHARED8]], align 4
+; CHECK-NEXT:    [[DYNAMIC_SHARED81:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) [[DYNAMIC_SHARED81]], i32 0, i32 7
+; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    ret void
+;
+  %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 7
+  store i64 3, ptr addrspace(3) %arrayidx
+  ret void
+}
+
+; The kernels are annotated with kernel.id and llvm.donothing use of the corresponding variable
+define amdgpu_kernel void @expect_align1() {
+; CHECK-LABEL: @expect_align1() #0 !llvm.amdgcn.lds.kernel.id !2
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align1.dynlds) ]
+; CHECK-NEXT:    call void @use_shared1()
+; CHECK-NEXT:    ret void
+;
+  call void @use_shared1()
+  ret void
+}
+
+define amdgpu_kernel void @expect_align2() {
+; CHECK-LABEL: @expect_align2() #0 !llvm.amdgcn.lds.kernel.id !3
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align2.dynlds) ]
+; CHECK-NEXT:    call void @use_shared2()
+; CHECK-NEXT:    ret void
+;
+  call void @use_shared2()
+  ret void
+}
+
+define amdgpu_kernel void @expect_align4() {
+; CHECK-LABEL: @expect_align4() !llvm.amdgcn.lds.kernel.id !4 {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align4.dynlds) ]
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT:    call void @use_shared4()
+; CHECK-NEXT:    ret void
+;
+  call void @use_shared4()
+  ret void
+}
+
+; Use dynamic_shared directly too. Can elide module lds (#0) 
+define amdgpu_kernel void @expect_align8() {
+; CHECK-LABEL: @expect_align8() #0 !llvm.amdgcn.lds.kernel.id !5 {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_align8.dynlds) ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9
+; CHECK-NEXT:    store i64 3, ptr addrspace(3) [[ARRAYIDX]], align 4
+; CHECK-NEXT:    call void @use_shared8()
+; CHECK-NEXT:    ret void
+;
+  %arrayidx = getelementptr inbounds [0 x i64], ptr addrspace(3) @dynamic_shared8, i32 0, i32 9
+  store i64 3, ptr addrspace(3) %arrayidx
+  call void @use_shared8()
+  ret void
+}
+
+; Note: use_shared4 uses module.lds so this will allocate at offset 4
+define amdgpu_kernel void @expect_max_of_2_and_4() {
+; CHECK-LABEL: @expect_max_of_2_and_4() !llvm.amdgcn.lds.kernel.id !6 {
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.expect_max_of_2_and_4.dynlds) ]
+; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
+; CHECK-NEXT:    call void @use_shared2()
+; CHECK-NEXT:    call void @use_shared4()
+; CHECK-NEXT:    ret void
+;
+  call void @use_shared2()
+  call void @use_shared4()
+  ret void
+}
+
+
+attributes #0 = { noinline }
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+; CHECK: declare void @llvm.donothing() #2
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+; CHECK: declare i32 @llvm.amdgcn.lds.kernel.id() #3
+
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds" }
+; CHECK: attributes #1 = { noinline }
+; CHECK: attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+; CHECK: !0 = !{i64 0, i64 1}
+; CHECK: !1 = !{i64 4, i64 5}
+; CHECK: !2 = !{i32 0}
+; CHECK: !3 = !{i32 1}
+; CHECK: !4 = !{i32 2}
+; CHECK: !5 = !{i32 3}
+; CHECK: !6 = !{i32 4}


        

