[llvm] [AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS (PR #89683)

Anshil Gandhi via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 23 23:42:53 PDT 2024


https://github.com/gandhi56 updated https://github.com/llvm/llvm-project/pull/89683

>From 032f3d9f0a3576b428e648f794d32db66f8778e8 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <Anshil.Gandhi at amd.com>
Date: Thu, 15 Feb 2024 22:09:41 +0000
Subject: [PATCH] [AMDGPU][LTO] Introduce AMDGPUCloneModuleLDS

The purpose of this pass is to ensure that the
combined module contains as many LDS global variables
as there are kernels that (indirectly) access them.
As LDS variables behave like C++ static variables,
it is important that each partition contains a
unique copy of the variable on a per kernel basis.
This representation also prepares the combined
module to eliminate cross-module dependencies of
LDS variables.

This pass operates as follows:
1. Firstly, traverse the call graph from each kernel
   to determine the number of kernels calling each
   device function.
2. For each LDS global variable GV, determine the
   function F that defines it. Collect its caller
   functions. Clone F and GV, and finally insert a
   call/invoke instruction in each caller function.

Change-Id: I998291a389ea3db10de9122f08fe55c981da6049
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   5 +
 .../Target/AMDGPU/AMDGPUCloneModuleLDS.cpp    | 139 ++++++++++++++++++
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |   1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   1 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 +
 .../llvm-split/AMDGPU/clone-lds-function.ll   |  58 ++++++++
 .../clone-lds-functions-ancestor-kernels.ll   | 106 +++++++++++++
 .../AMDGPU/clone-lds-functions-successors.ll  | 132 +++++++++++++++++
 .../AMDGPU/clone-lds-struct-insts.ll          | 100 +++++++++++++
 9 files changed, 543 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll
 create mode 100644 llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 6016bd5187d887..f913833a25119b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -149,6 +149,11 @@ struct AMDGPULowerBufferFatPointersPass
   const TargetMachine &TM;
 };
 
+struct AMDGPUCloneModuleLDSPass
+    : public PassInfoMixin<AMDGPUCloneModuleLDSPass> {
+  PreservedAnalyses run(Module &, ModuleAnalysisManager &);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp
new file mode 100644
index 00000000000000..9205e62a9a3d78
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCloneModuleLDS.cpp
@@ -0,0 +1,139 @@
+//===-- AMDGPUCloneModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to ensure that the combined module contains
+// as many LDS global variables as there are kernels that (indirectly) access
+// them. As LDS variables behave like C++ static variables, it is important that
+// each partition contains a unique copy of the variable on a per kernel basis.
+// This representation also prepares the combined module to eliminate
+// cross-module dependencies of LDS variables.
+//
+// This pass operates as follows:
+// 1. Firstly, traverse the call graph from each kernel to determine the number
+//    of kernels calling each device function.
+// 2. For each LDS global variable GV, determine the function F that defines it.
+//    Collect its caller functions. Clone F and GV, and finally insert a
+//    call/invoke instruction in each caller function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-clone-module-lds"
+
+static cl::opt<unsigned int> MaxCountForClonedFunctions(
+    "clone-lds-functions-max-count", cl::init(16), cl::Hidden,
+    cl::desc("Specify a limit to the number of clones of a function"));
+
+/// Return the function that defines \p GV
+/// \param GV The global variable in question
+/// \return The function defining \p GV
+static Function *getFunctionDefiningGV(GlobalVariable &GV) {
+  SmallVector<User *> Worklist(GV.users());
+  while (!Worklist.empty()) {
+    User *U = Worklist.pop_back_val();
+    if (auto *Inst = dyn_cast<Instruction>(U))
+      return Inst->getFunction();
+    if (auto *Op = dyn_cast<Operator>(U))
+      append_range(Worklist, Op->users());
+  }
+  return nullptr;
+};
+
+PreservedAnalyses AMDGPUCloneModuleLDSPass::run(Module &M,
+                                                ModuleAnalysisManager &AM) {
+  if (MaxCountForClonedFunctions.getValue() == 1)
+    return PreservedAnalyses::all();
+
+  bool Changed = false;
+  auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+  // For each function in the call graph, determine the number
+  // of ancestor-caller kernels.
+  DenseMap<Function *, unsigned int> KernelRefsToFuncs;
+  for (auto &Fn : M) {
+    if (Fn.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+    for (auto I = df_begin(&CG), E = df_end(&CG); I != E; ++I)
+      if (auto *F = I->getFunction())
+        KernelRefsToFuncs[F]++;
+  }
+
+  DenseMap<GlobalVariable *, Function *> GVToFnMap;
+  for (auto &GV : M.globals()) {
+    if (GVToFnMap.contains(&GV) ||
+        GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+        !GV.hasInitializer())
+      continue;
+
+    auto *OldF = getFunctionDefiningGV(GV);
+    GVToFnMap.insert({&GV, OldF});
+    LLVM_DEBUG(dbgs() << "Found LDS " << GV.getName() << " used in function "
+                      << OldF->getName() << '\n');
+
+    // Collect all call instructions to OldF
+    SmallVector<Instruction *> InstsCallingOldF;
+    for (auto &I : OldF->uses())
+      if (auto *CI = dyn_cast<CallBase>(I.getUser()))
+        InstsCallingOldF.push_back(CI);
+
+    // Create as many clones of the function containing LDS global as
+    // there are kernels calling the function (including the function
+    // already defining the LDS global). Respectively, clone the
+    // LDS global and the call instructions to the function.
+    LLVM_DEBUG(dbgs() << "\tFunction is referenced by "
+                      << KernelRefsToFuncs[OldF] << " kernels.\n");
+    for (unsigned int ID = 0;
+         ID + 1 < std::min(KernelRefsToFuncs[OldF],
+                           MaxCountForClonedFunctions.getValue());
+         ++ID) {
+      // Clone LDS global variable
+      auto *NewGV = new GlobalVariable(
+          M, GV.getValueType(), GV.isConstant(), GlobalValue::InternalLinkage,
+          PoisonValue::get(GV.getValueType()),
+          GV.getName() + ".clone." + Twine(ID), &GV,
+          GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS, false);
+      NewGV->copyAttributesFrom(&GV);
+      NewGV->copyMetadata(&GV, 0);
+      NewGV->setComdat(GV.getComdat());
+      LLVM_DEBUG(dbgs() << "Inserting LDS clone with name " << NewGV->getName()
+                        << '\n');
+      
+      // Clone function
+      ValueToValueMapTy VMap;
+      VMap[&GV] = NewGV;
+      auto *NewF = CloneFunction(OldF, VMap);
+      NewF->setName(OldF->getName() + ".clone." + Twine(ID));
+      LLVM_DEBUG(dbgs() << "Inserting function clone with name "
+                        << NewF->getName() << '\n');
+
+
+      // Create a new CallInst to call the cloned function
+      for (auto *Inst : InstsCallingOldF) {
+        Instruction *I = Inst->clone();
+        I->setName(Inst->getName() + ".clone." + Twine(ID));
+        if (auto *CI = dyn_cast<CallBase>(I))
+          CI->setCalledOperand(NewF);
+        I->insertAfter(Inst);
+        LLVM_DEBUG(dbgs() << "Inserting inst: " << *I << '\n');
+      }
+      Changed = true;
+    }
+  }
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 90f36fadf35903..eb4bf25fef628a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -22,6 +22,7 @@ MODULE_PASS("amdgpu-lower-buffer-fat-pointers",
             AMDGPULowerBufferFatPointersPass(*this))
 MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
+MODULE_PASS("amdgpu-clone-module-lds", AMDGPUCloneModuleLDSPass())
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
 #undef MODULE_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 305a6c8c3b9262..09beabd3f9c55c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -725,6 +725,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
         // We want to support the -lto-partitions=N option as "best effort".
         // For that, we need to lower LDS earlier in the pipeline before the
         // module is partitioned for codegen.
+        PM.addPass(AMDGPUCloneModuleLDSPass());
         if (EnableLowerModuleLDS)
           PM.addPass(AMDGPULowerModuleLDSPass(*this));
       });
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 48325a0928f93d..fbf59e0422cb79 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUAtomicOptimizer.cpp
   AMDGPUAttributor.cpp
   AMDGPUCallLowering.cpp
+  AMDGPUCloneModuleLDS.cpp
   AMDGPUCodeGenPrepare.cpp
   AMDGPUCombinerHelper.cpp
   AMDGPUCtorDtorLowering.cpp
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll
new file mode 100644
index 00000000000000..28060bc27da297
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-function.ll
@@ -0,0 +1,58 @@
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; In this example, the CloneModuleLDS pass creates two copies of LDS_GV
+; as two kernels call the same device function where LDS_GV is used.
+
+; CHECK: [[LDS_GV_CLONE:@.*\.clone\.0]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[LDS_GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+ at lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+
+define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @lds_func(i32 %n)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]])
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @lds_func(i32 %n)
+  ret void
+}
+
+
+define i32 @lds_func(i32 %x) {
+; CHECK-LABEL: define i32 @lds_func(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV]] to ptr), i64 0, i64 0
+; CHECK-NEXT:    store i32 [[X]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[X]]
+;
+entry:
+  %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
+  store i32 %x, ptr %p
+  ret i32 %x
+}
+
+; CHECK-LABEL: define i32 @lds_func.clone.0(i32 %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[LDS_GV_CLONE]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   store i32 %x, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %x
+
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll
new file mode 100644
index 00000000000000..eb428a2743efbc
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-ancestor-kernels.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; Before transformation,                    After transformation,
+;  K1  K2                                    K1  K2
+;  |  /                                      |  /
+;  | /                                       | /
+;  A                         ==>             A
+;  | \                                       | \
+;  |  \                                      |  \
+;  B   C                                     B   C
+;  |                                         | \
+;  X                                         X1 X2
+;
+; where X contains an LDS reference
+
+; CHECK: [[GV_CLONE:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+ at lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+
+define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @A(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @A(i32 %n)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @A(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @A(i32 %n)
+  ret void
+}
+
+define void @A() {
+; CHECK-LABEL: define void @A() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @B()
+; CHECK-NEXT:    call void @C()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @B()
+  call void @C()
+  ret void
+}
+
+define i32 @B() {
+; CHECK-LABEL: define i32 @B() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 5, ptr [[P]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @X(ptr [[P]])
+; CHECK-NEXT:    [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %p = alloca i32
+  store i32 5, ptr %p
+  %ret = call i32 @X(ptr %p)
+  ret i32 %ret
+}
+
+define void @C() {
+; CHECK-LABEL: define void @C() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  ret void
+}
+
+define i32 @X(ptr %x) {
+; CHECK-LABEL: define i32 @X(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT:    store i32 [[V]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+entry:
+  %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
+  %v = load i32, ptr %x
+  store i32 %v, ptr %p
+  ret i32 %v
+}
+
+; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   %v = load i32, ptr %x, align 4
+; CHECK-NEXT:   store i32 %v, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %v
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll
new file mode 100644
index 00000000000000..d3e3e202ea59da
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-functions-successors.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+; Before transformation,                    After transformation,
+;  K1  K2    K3                              K1  K2    K3
+;  |  /      |                               |  /      |
+;  | /       |                               | /       |
+;  A --------+               ==>             A --------+
+;  |                                         |
+;  |                                         |
+;  B                                         B
+;  |                                       / | \
+;  X                                      X1 X2 X3
+;  |                                      \  |  /
+;  D                                       \ | /
+;                                            D
+; where X contains an LDS reference
+
+; CHECK: [[GV_CLONE_0:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[GV_CLONE_1:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 16
+; CHECK: [[GV:@.*]] = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+ at lds_gv = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 16
+
+define protected amdgpu_kernel void @kernel1(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @A(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @A(i32 %n)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @A(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @A(i32 %n)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel3(i32 %n) {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel3(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @A(i32 [[N]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @A(i32 %n)
+  ret void
+}
+
+define void @A() {
+; CHECK-LABEL: define void @A() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call void @B()
+; CHECK-NEXT:    ret void
+;
+entry:
+  call void @B()
+  ret void
+}
+
+define i32 @B() {
+; CHECK-LABEL: define i32 @B() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 5, ptr [[P]], align 4
+; CHECK-NEXT:    [[RET:%.*]] = call i32 @X(ptr [[P]])
+; CHECK-NEXT:    [[RET_CLONE_1:%.*]] = call i32 @X.clone.1(ptr [[P]])
+; CHECK-NEXT:    [[RET_CLONE_0:%.*]] = call i32 @X.clone.0(ptr [[P]])
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %p = alloca i32
+  store i32 5, ptr %p
+  %ret = call i32 @X(ptr %p)
+  ret i32 %ret
+}
+
+define i32 @X(ptr %x) {
+; CHECK-LABEL: define i32 @X(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 0, i64 0
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[X]], align 4
+; CHECK-NEXT:    call void @D(ptr [[P]])
+; CHECK-NEXT:    store i32 [[V]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[V]]
+;
+entry:
+  %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 0, i64 0
+  %v = load i32, ptr %x
+  call void @D(ptr %p)
+  store i32 %v, ptr %p
+  ret i32 %v
+}
+
+define void @D(ptr %x) {
+; CHECK-LABEL: define void @D(ptr %x) {
+; CHECK-NEXT:   entry:
+; CHECK-NEXT:     store i32 8, ptr %x, align 4
+; CHECK-NEXT:     ret void
+entry:
+  store i32 8, ptr %x
+  ret void
+}
+
+; CHECK-LABEL: define i32 @X.clone.0(ptr %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_0]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   %v = load i32, ptr %x, align 4
+; CHECK-NEXT:   call void @D(ptr [[P]])
+; CHECK-NEXT:   store i32 %v, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %v
+
+; CHECK-LABEL: define i32 @X.clone.1(ptr %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %p = getelementptr inbounds [64 x i32], ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_1]] to ptr), i64 0, i64 0
+; CHECK-NEXT:   %v = load i32, ptr %x, align 4
+; CHECK-NEXT:   call void @D(ptr [[P]])
+; CHECK-NEXT:   store i32 %v, ptr %p, align 4
+; CHECK-NEXT:   ret i32 %v
diff --git a/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll
new file mode 100644
index 00000000000000..c398119ed08cba
--- /dev/null
+++ b/llvm/test/tools/llvm-split/AMDGPU/clone-lds-struct-insts.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=amdgpu-clone-module-lds %s -S | FileCheck %s
+
+target triple = "amdgcn-amd-amdhsa"
+
+%struct.RT = type { i8, [10 x [20 x i32]], i8 }
+%struct.GV = type { i32, double, %struct.RT }
+
+; CHECK: [[GV_CLONE_0:@.*]] = internal addrspace(3) global %struct.GV poison, align 8
+; CHECK: [[GV:@.*]] = internal addrspace(3) global %struct.GV zeroinitializer, align 8
+ at lds_gv = internal addrspace(3) global %struct.GV zeroinitializer, align 8
+
+define protected amdgpu_kernel void @kernel1(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel1(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]], i1 false)
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @lds_func(i32 %n, i1 0)
+  ret void
+}
+
+define protected amdgpu_kernel void @kernel2(i32 %n) #3 {
+; CHECK-LABEL: define protected amdgpu_kernel void @kernel2(
+; CHECK-SAME: i32 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @lds_func(i32 [[N]], i1 true)
+; CHECK-NEXT:    [[CALL_CLONE_0:%.*]] = call i32 @lds_func.clone.0(i32 [[N]], i1 true)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %call = call i32 @lds_func(i32 %n, i1 1)
+  ret void
+}
+
+define i32 @lds_func(i32 %x, i1 %cond) {
+; CHECK-LABEL: define i32 @lds_func(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   [[TMP_0:%.*]] = alloca %struct.GV, align 8, addrspace(3)
+; CHECK-NEXT:   %p = getelementptr inbounds [[STRUCT_GV:%.*]], ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 13
+; CHECK-NEXT:   store i32 %x, ptr addrspace(3) %p, align 4
+; CHECK-NEXT:   store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 12), align 4
+; CHECK-NEXT:   store ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 1), align 4
+; CHECK-NEXT:   %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) [[GV]] to ptr), i64 6), align 1
+; CHECK-NEXT:   br i1 %cond, label %bb.1, label %bb.2
+; CHECK:      bb.1:                                             ; preds = %entry
+; CHECK-NEXT:   br label %sink
+; CHECK:      bb.2:                                             ; preds = %entry
+; CHECK-NEXT:   br label %sink
+; CHECK:      sink:                                             ; preds = %bb.2, %bb.1
+; CHECK-NEXT:   %val = phi ptr addrspace(3) [ [[TMP_0]], %bb.1 ], [ [[GV]], %bb.2 ]
+; CHECK-NEXT:   %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) [[GV]], i64 1, i32 2, i32 1, i64 5, i64 1
+; CHECK-NEXT:   %retval = load i32, ptr addrspace(3) %p.0, align 4
+; CHECK-NEXT:   ret i32 %retval
+;
+entry:
+  %tmp.GV = alloca %struct.GV, addrspace(3)
+  %p = getelementptr inbounds %struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 13
+  store i32 %x, ptr addrspace(3) %p
+  store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 12)
+  store ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 1)
+  %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) @lds_gv to ptr), i64 6), align 1
+  br i1 %cond, label %bb.1, label %bb.2
+
+bb.1:
+  br label %sink
+
+bb.2:
+  br label %sink
+
+sink:
+  %val = phi ptr addrspace(3) [%tmp.GV, %bb.1], [@lds_gv, %bb.2]
+  %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) @lds_gv, i64 1, i32 2, i32 1, i64 5, i64 1
+  %retval = load i32, ptr addrspace(3) %p.0
+  ret i32 %retval
+}
+
+; CHECK-LABEL: define i32 @lds_func.clone.0(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   [[TMP_0]] = alloca %struct.GV, align 8, addrspace(3)
+; CHECK-NEXT:   %p = getelementptr inbounds [[STRUCT_GV:%.*]], ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 13
+; CHECK-NEXT:   store i32 %x, ptr addrspace(3) %p, align 4
+; CHECK-NEXT:   store i32 %x, ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 12), align 4
+; CHECK-NEXT:   store ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 11), ptr addrspace(3) getelementptr inbounds (%struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 1), align 4
+; CHECK-NEXT:   %gep.ascast = load i8, ptr getelementptr inbounds (%struct.GV, ptr addrspacecast (ptr addrspace(3) [[GV_CLONE_0]] to ptr), i64 6), align 1
+; CHECK-NEXT:   br i1 %cond, label %bb.1, label %bb.2
+; CHECK:      bb.1:                                             ; preds = %entry
+; CHECK-NEXT:   br label %sink
+; CHECK:      bb.2:                                             ; preds = %entry
+; CHECK-NEXT:   br label %sink
+; CHECK:      sink:                                             ; preds = %bb.2, %bb.1
+; CHECK-NEXT:   %val = phi ptr addrspace(3) [ [[TMP_0]], %bb.1 ], [ [[GV_CLONE_0]], %bb.2 ]
+; CHECK-NEXT:   %p.0 = getelementptr inbounds %struct.GV, ptr addrspace(3) [[GV_CLONE_0]], i64 1, i32 2, i32 1, i64 5, i64 1
+; CHECK-NEXT:   %retval = load i32, ptr addrspace(3) %p.0, align 4
+; CHECK-NEXT:   ret i32 %retval



More information about the llvm-commits mailing list