[llvm] [DebugInfo][NVPTX] Adding support for `inlined_at` debug directive in NVPTX backend (PR #170239)

Laxman Sole via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 11 18:55:02 PST 2025


https://github.com/laxmansole updated https://github.com/llvm/llvm-project/pull/170239

>From cd4de2f207bb8d16f68a9a5ab91f9339586ebdd0 Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Wed, 26 Nov 2025 15:55:11 -0800
Subject: [PATCH 1/3] Adding support for inlined_at debug directive in NVPTX
 backend

---
 llvm/include/llvm/CodeGen/AsmPrinter.h        |   4 +
 llvm/include/llvm/MC/MCStreamer.h             |   8 +
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |   4 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |  19 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h      |  21 +-
 .../CodeGen/AsmPrinter/DwarfStringPool.cpp    |   7 +-
 llvm/lib/MC/MCAsmStreamer.cpp                 | 123 +++++--
 llvm/lib/Target/NVPTX/CMakeLists.txt          |   3 +-
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     |   5 +
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h       |   3 +
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp     | 165 +++++++++
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h       |  52 +++
 llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll      | 144 ++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll      |  85 +++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll      | 305 ++++++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll      | 229 ++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll      | 208 +++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll      | 343 ++++++++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll      | 118 ++++++
 19 files changed, 1801 insertions(+), 45 deletions(-)
 create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
 create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll

diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 7f99e81e7d1a0..fcfe0cebebb43 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -269,6 +269,10 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass {
   AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer,
              char &ID = AsmPrinter::ID);
 
+  /// Create the DwarfDebug handler. Targets can override this to provide
+  /// custom debug information handling.
+  virtual DwarfDebug *createDwarfDebug();
+
 public:
   ~AsmPrinter() override;
 
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 79c715e3820a6..6fe858c3201eb 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -902,6 +902,14 @@ class LLVM_ABI MCStreamer {
                                      StringRef FileName,
                                      StringRef Comment = {});
 
+  /// This is same as emitDwarfLocDirective, except has capability to
+  /// add inlined_at information.
+  virtual void emitDwarfLocDirectiveWithInlinedAt(
+      unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+      unsigned LineIA, unsigned ColumnIA, const MCSymbol *Sym, unsigned Flags,
+      unsigned Isa, unsigned Discriminator, StringRef FileName,
+      StringRef Comment = {}) {}
+
   /// This implements the '.loc_label Name' directive.
   virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name);
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 965f7f59ce9a9..61784b9f85a28 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -470,6 +470,8 @@ const MCSection *AsmPrinter::getCurrentSection() const {
   return OutStreamer->getCurrentSectionOnly();
 }
 
+DwarfDebug *AsmPrinter::createDwarfDebug() { return new DwarfDebug(this); }
+
 void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -590,7 +592,7 @@ bool AsmPrinter::doInitialization(Module &M) {
       Handlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
       if (hasDebugInfo()) {
-        DD = new DwarfDebug(this);
+        DD = createDwarfDebug();
         Handlers.push_back(std::unique_ptr<DwarfDebug>(DD));
       }
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 40bfea059c707..3fd60d70cfb68 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2204,13 +2204,27 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
       Flags |= DWARF2_FLAG_IS_STMT;
   }
 
-  RecordSourceLine(DL, Flags);
+  // Call the hook that allows targets to customize source line recording
+  recordSourceLineHook(*MI, DL, Flags);
 
   // If we're not at line 0, remember this location.
   if (DL.getLine())
     PrevInstLoc = DL;
 }
 
+// Default implementation of target-specific hook for custom source line
+// recording
+void DwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+                                      const DebugLoc &DL, unsigned Flags) {
+  SmallString<128> LocationString;
+  if (Asm->OutStreamer->isVerboseAsm()) {
+    raw_svector_ostream OS(LocationString);
+    DL.print(OS);
+  }
+  recordSourceLine(DL.getLine(), DL.getCol(), DL.getScope(), Flags,
+                   LocationString);
+}
+
 // Returns the position where we should place prologue_end, potentially nullptr,
 // which means "no good place to put prologue_end". Returns true in the second
 // return value if there are no setup instructions in this function at all,
@@ -2705,6 +2719,9 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
   Asm->OutStreamer->getContext().setDwarfCompileUnitID(
       getDwarfCompileUnitIDForLineTable(CU));
 
+  // Call target-specific hook for custom initialization
+  beginFunctionHook(*MF);
+
   // Record beginning of function.
   PrologEndLoc = emitInitialLocDirective(
       *MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 1a1b28a6fc035..58aeb09645984 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -394,9 +394,6 @@ class DwarfDebug : public DebugHandlerBase {
   /// table for the same directory as DW_AT_comp_dir.
   StringRef CompilationDir;
 
-  /// Holder for the file specific debug information.
-  DwarfFile InfoHolder;
-
   /// Holders for the various debug information flags that we might need to
   /// have exposed. See accessor functions below for description.
 
@@ -532,10 +529,6 @@ class DwarfDebug : public DebugHandlerBase {
 
   MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
 
-  const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
-    return InfoHolder.getUnits();
-  }
-
   using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
 
   void ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
@@ -711,6 +704,8 @@ class DwarfDebug : public DebugHandlerBase {
   void computeKeyInstructions(const MachineFunction *MF);
 
 protected:
+  /// Holder for the file specific debug information.
+  DwarfFile InfoHolder;
   /// Gather pre-function debug information.
   void beginFunctionImpl(const MachineFunction *MF) override;
 
@@ -722,6 +717,18 @@ class DwarfDebug : public DebugHandlerBase {
 
   void skippedNonDebugFunction() override;
 
+  /// Target-specific hook for custom initialization,
+  /// default implementation is empty, only being used for NVPTX target
+  virtual void beginFunctionHook(const MachineFunction &MF) {}
+
+  /// Target-specific hook for custom source line recording
+  virtual void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+                                    unsigned Flags);
+
+  const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+    return InfoHolder.getUnits();
+  }
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index d304c7efe2a75..4ccad65bdba91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -96,9 +96,12 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
     if (ShouldCreateSymbols)
       Asm.OutStreamer->emitLabel(Entry->getValue().Symbol);
 
+    // Emit a comment with the string offset and the string itself.
+    Asm.OutStreamer->AddComment(
+        "string offset=" + Twine(Entry->getValue().Offset) + " ; " +
+        StringRef(Entry->getKeyData(), Entry->getKeyLength()));
+
     // Emit the string itself with a terminating null byte.
-    Asm.OutStreamer->AddComment("string offset=" +
-                                Twine(Entry->getValue().Offset));
     Asm.OutStreamer->emitBytes(
         StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
   }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index e2543058394a2..bdfaff0a22417 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -70,6 +70,17 @@ class MCAsmStreamer final : public MCStreamer {
   void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
   void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
 
+  /// Helper to emit common .loc directive flags, isa, and discriminator
+  void emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+                                  unsigned Discriminator);
+
+  /// Helper to emit the common suffix of .loc directives (flags, comment, EOL,
+  /// parent call)
+  void emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+                                   unsigned Column, unsigned Flags,
+                                   unsigned Isa, unsigned Discriminator,
+                                   StringRef FileName, StringRef Comment);
+
 public:
   MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
                 std::unique_ptr<MCInstPrinter> printer,
@@ -297,6 +308,14 @@ class MCAsmStreamer final : public MCStreamer {
                              StringRef Location = {}) override;
   void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override;
 
+  void emitDwarfLocDirectiveWithInlinedAt(unsigned FileNo, unsigned Line,
+                                          unsigned Column, unsigned FileIA,
+                                          unsigned LineIA, unsigned ColIA,
+                                          const MCSymbol *Sym, unsigned Flags,
+                                          unsigned Isa, unsigned Discriminator,
+                                          StringRef FileName,
+                                          StringRef Comment = {}) override;
+
   MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
 
   bool emitCVFileDirective(unsigned FileNo, StringRef Filename,
@@ -1675,6 +1694,57 @@ void MCAsmStreamer::emitDwarfFile0Directive(
     emitRawText(OS1.str());
 }
 
+/// Helper to emit common .loc directive flags, isa, and discriminator
+void MCAsmStreamer::emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+                                               unsigned Discriminator) {
+  if (!MAI->supportsExtendedDwarfLocDirective())
+    return;
+
+  if (Flags & DWARF2_FLAG_BASIC_BLOCK)
+    OS << " basic_block";
+  if (Flags & DWARF2_FLAG_PROLOGUE_END)
+    OS << " prologue_end";
+  if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
+    OS << " epilogue_begin";
+
+  const unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
+  if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
+    OS << " is_stmt ";
+    OS << ((Flags & DWARF2_FLAG_IS_STMT) ? "1" : "0");
+  }
+
+  if (Isa)
+    OS << " isa " << Isa;
+  if (Discriminator)
+    OS << " discriminator " << Discriminator;
+}
+
+/// Helper to emit the common suffix of .loc directives
+void MCAsmStreamer::emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+                                                unsigned Column, unsigned Flags,
+                                                unsigned Isa,
+                                                unsigned Discriminator,
+                                                StringRef FileName,
+                                                StringRef Comment) {
+  // Emit flags, isa, and discriminator
+  emitDwarfLocDirectiveFlags(Flags, Isa, Discriminator);
+
+  // Emit verbose comment if enabled
+  if (IsVerboseAsm) {
+    OS.PadToColumn(MAI->getCommentColumn());
+    OS << MAI->getCommentString() << ' ';
+    if (Comment.empty())
+      OS << FileName << ':' << Line << ':' << Column;
+    else
+      OS << Comment;
+  }
+
+  // Emit end of line and update parent state
+  EmitEOL();
+  MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
+                                    Discriminator, FileName, Comment);
+}
+
 void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                           unsigned Column, unsigned Flags,
                                           unsigned Isa, unsigned Discriminator,
@@ -1691,42 +1761,29 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
     return;
   }
 
+  // Emit the basic .loc directive
   OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
-  if (MAI->supportsExtendedDwarfLocDirective()) {
-    if (Flags & DWARF2_FLAG_BASIC_BLOCK)
-      OS << " basic_block";
-    if (Flags & DWARF2_FLAG_PROLOGUE_END)
-      OS << " prologue_end";
-    if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
-      OS << " epilogue_begin";
-
-    unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
-    if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
-      OS << " is_stmt ";
-
-      if (Flags & DWARF2_FLAG_IS_STMT)
-        OS << "1";
-      else
-        OS << "0";
-    }
 
-    if (Isa)
-      OS << " isa " << Isa;
-    if (Discriminator)
-      OS << " discriminator " << Discriminator;
-  }
+  // Emit common suffix (flags, comment, EOL, parent call)
+  emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+                              FileName, Comment);
+}
 
-  if (IsVerboseAsm) {
-    OS.PadToColumn(MAI->getCommentColumn());
-    OS << MAI->getCommentString() << ' ';
-    if (Comment.empty())
-      OS << FileName << ':' << Line << ':' << Column;
-    else
-      OS << Comment;
-  }
-  EmitEOL();
-  this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
-                                          Discriminator, FileName, Comment);
+/// This is same as emitDwarfLocDirective, except also emits inlined function
+/// and inlined callsite information.
+void MCAsmStreamer::emitDwarfLocDirectiveWithInlinedAt(
+    unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+    unsigned LineIA, unsigned ColIA, const MCSymbol *Sym, unsigned Flags,
+    unsigned Isa, unsigned Discriminator, StringRef FileName,
+    StringRef Comment) {
+  // Emit the basic .loc directive with NVPTX-specific extensions
+  OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
+  OS << ", function_name " << *Sym;
+  OS << ", inlined_at " << FileIA << " " << LineIA << " " << ColIA;
+
+  // Emit common suffix (flags, comment, EOL, parent call)
+  emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+                              FileName, Comment);
 }
 
 void MCAsmStreamer::emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) {
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 6fe58c25c757d..505e7e945f108 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,12 +18,13 @@ set(NVPTXCodeGen_sources
   NVPTXAssignValidGlobalNames.cpp
   NVPTXAtomicLower.cpp
   NVPTXCtorDtorLowering.cpp
-  NVPTXIRPeephole.cpp
+  NVPTXDwarfDebug.cpp
   NVPTXForwardParams.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
   NVPTXImageOptimizer.cpp
   NVPTXInstrInfo.cpp
+  NVPTXIRPeephole.cpp
   NVPTXISelDAGToDAG.cpp
   NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9bbb3aad89c44..d5c9988d0eb53 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
 #include "MCTargetDesc/NVPTXTargetStreamer.h"
 #include "NVPTX.h"
+#include "NVPTXDwarfDebug.h"
 #include "NVPTXMCExpr.h"
 #include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXRegisterInfo.h"
@@ -676,6 +677,10 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
   OutStreamer->emitRawText(OS1.str());
 }
 
+DwarfDebug *NVPTXAsmPrinter::createDwarfDebug() {
+  return new NVPTXDwarfDebug(this);
+}
+
 bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
   const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index f35931868d99f..20c1bbb9e15b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -194,6 +194,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
   bool doInitialization(Module &M) override;
   bool doFinalization(Module &M) override;
 
+  // Create NVPTX-specific DwarfDebug handler
+  DwarfDebug *createDwarfDebug() override;
+
 private:
   bool GlobalsEmitted;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
new file mode 100644
index 0000000000000..57410179a6344
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -0,0 +1,165 @@
+//===-- NVPTXDwarfDebug.cpp - NVPTX DwarfDebug Implementation ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper functions for NVPTX-specific debug information
+// processing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXDwarfDebug.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// Command line option to control inlined_at enhancement to lineinfo support.
+// Valid only when debuginfo emissionkind is DebugDirectivesOnly or
+// LineTablesOnly.
+static cl::opt<bool> LineInfoWithInlinedAt(
+    "line-info-inlined-at",
+    cl::desc("Emit line with inlined_at enhancement for NVPTX"), cl::init(true),
+    cl::Hidden);
+
+NVPTXDwarfDebug::NVPTXDwarfDebug(AsmPrinter *A) : DwarfDebug(A) {}
+
+// Collect all inlined_at locations for the current function.
+void NVPTXDwarfDebug::collectInlinedAtLocations(const MachineFunction &MF) {
+  const DISubprogram *SP = MF.getFunction().getSubprogram();
+  assert(SP && "expecting valid subprogram here");
+
+  // inlined_at support requires PTX 7.2 or later.
+  const NVPTXSubtarget &STI = MF.getSubtarget<NVPTXSubtarget>();
+  if (STI.getPTXVersion() < 72)
+    return;
+
+  if (!(SP->getUnit()->isDebugDirectivesOnly() ||
+        SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly) ||
+      !LineInfoWithInlinedAt) // No enhanced lineinfo, we are done.
+    return;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      const DebugLoc &DL = MI.getDebugLoc();
+      if (!DL)
+        continue;
+      const DILocation *InlinedAt = DL.getInlinedAt();
+      while (InlinedAt) {
+        if (!InlinedAtLocs.insert(InlinedAt).second)
+          break;
+        InlinedAt = InlinedAt->getInlinedAt();
+      }
+    }
+  }
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
+                                                   unsigned Flags) {
+  const DebugLoc &DL = MI.getDebugLoc();
+  // Maintain a work list of .loc to be emitted. If we are emitting the
+  // inlined_at directive, we might need to emit additional .loc prior
+  // to it for the location contained in the inlined_at.
+  SmallVector<const DILocation *, 8> WorkList;
+  DenseSet<const DILocation *> WorkListSet;
+  const DILocation *EmitLoc = DL.get();
+
+  const DISubprogram *SP = MI.getMF()->getFunction().getSubprogram();
+  const NVPTXSubtarget &STI = MI.getMF()->getSubtarget<NVPTXSubtarget>();
+  const bool EnhancedLineinfo =
+      LineInfoWithInlinedAt && (STI.getPTXVersion() >= 72) && SP &&
+      (SP->getUnit()->isDebugDirectivesOnly() ||
+       SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly);
+
+  while (EmitLoc) {
+    // Get the scope for the current location.
+    const DIScope *Scope = EmitLoc->getScope();
+    if (!Scope)
+      break; // scope is null, we are done.
+
+    // Check if this loc is already in work list, if so, we are done.
+    if (WorkListSet.contains(EmitLoc))
+      break;
+
+    // Add this location to the work list.
+    WorkList.push_back(EmitLoc);
+    WorkListSet.insert(EmitLoc);
+
+    if (!EnhancedLineinfo) // No enhanced lineinfo, we are done.
+      break;
+
+    const DILocation *IA = EmitLoc->getInlinedAt();
+    // Check if this has inlined_at information, and if we have not yet
+    // emitted the .loc for the inlined_at location.
+    if (IA && InlinedAtLocs.contains(IA))
+      EmitLoc = IA;
+    else // We are done
+      break;
+  }
+
+  const unsigned CUID = Asm->OutStreamer->getContext().getDwarfCompileUnitID();
+  // Traverse the work list, and emit .loc.
+  while (!WorkList.empty()) {
+    const DILocation *Current = WorkList.pop_back_val();
+    const DIScope *Scope = Current->getScope();
+
+    if (!Scope)
+      llvm_unreachable("we shouldn't be here for null scope");
+
+    const DILocation *InlinedAt = Current->getInlinedAt();
+    StringRef Fn = Scope->getFilename();
+    const unsigned Line = Current->getLine();
+    const unsigned Col = Current->getColumn();
+    unsigned Discriminator = 0;
+    if (Line != 0 && getDwarfVersion() >= 4)
+      if (const DILexicalBlockFile *LBF = dyn_cast<DILexicalBlockFile>(Scope))
+        Discriminator = LBF->getDiscriminator();
+
+    const unsigned FileNo = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+                                .getOrCreateSourceID(Scope->getFile());
+    // Remove this location from the work list if it is in the inlined_at
+    // locations set.
+    if (EnhancedLineinfo && InlinedAtLocs.contains(Current))
+      InlinedAtLocs.erase(Current);
+
+    if (EnhancedLineinfo && InlinedAt) {
+      const unsigned FileIA = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+                                  .getOrCreateSourceID(InlinedAt->getFile());
+      const DISubprogram *SubProgram = getDISubprogram(Current->getScope());
+      DwarfStringPoolEntryRef Entry = InfoHolder.getStringPool().getEntry(
+          *Asm, SubProgram->getLinkageName());
+      Asm->OutStreamer->emitDwarfLocDirectiveWithInlinedAt(
+          FileNo, Line, Col, FileIA, InlinedAt->getLine(),
+          InlinedAt->getColumn(), Entry.getSymbol(), Flags, 0, Discriminator,
+          Fn);
+    } else {
+      Asm->OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
+                                              Discriminator, Fn);
+    }
+  }
+}
+
+// NVPTX-specific function initialization hook.
+void NVPTXDwarfDebug::beginFunctionHook(const MachineFunction &MF) {
+  InlinedAtLocs.clear();
+  collectInlinedAtLocations(MF);
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+                                           const DebugLoc &DL, unsigned Flags) {
+  // Call NVPTX-specific implementation that handles inlined_at.
+  recordSourceLineAndInlinedAt(MI, Flags);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
new file mode 100644
index 0000000000000..d7032725134e6
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -0,0 +1,52 @@
+//===-- NVPTXDwarfDebug.h - NVPTX DwarfDebug Implementation ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares helper classes and functions for NVPTX-specific debug
+// information processing, particularly for inlined function call sites and
+// enhanced line information with inlined_at directives.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+
+#include "../../CodeGen/AsmPrinter/DwarfCompileUnit.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace llvm {
+
+/// NVPTXDwarfDebug - NVPTX-specific DwarfDebug implementation.
+/// Inherits from DwarfDebug to provide enhanced line information with
+/// inlined_at support.
+class NVPTXDwarfDebug : public DwarfDebug {
+private:
+  /// Set of InlinedAt locations, used to track if these have been emitted.
+  DenseSet<const DILocation *> InlinedAtLocs;
+
+public:
+  /// Constructor - Pass through to DwarfDebug constructor.
+  NVPTXDwarfDebug(AsmPrinter *A);
+
+  /// Collect all inlined_at locations for the current function.
+  void collectInlinedAtLocations(const MachineFunction &MF);
+
+protected:
+  /// Override hook to collect inlined_at locations.
+  void beginFunctionHook(const MachineFunction &MF) override;
+  /// Override hook to record source line information with inlined_at support.
+  void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+                            unsigned Flags) override;
+
+private:
+  /// NVPTX-specific source line recording with inlined_at support.
+  void recordSourceLineAndInlinedAt(const MachineInstr &MI, unsigned Flags);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
new file mode 100644
index 0000000000000..4ce97900be584
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test mutual recursion with deep inlining - verifies that inlined_at information
+;; is correctly emitted for multiple levels of inlining when foo() and bar() call each other.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+;   calculate();
+; }
+;
+; __device__ void bar() {
+;   if (gg > 17)
+;     foo();
+;   calculate();
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 21
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 16
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+define internal fastcc void @_Z3foov() unnamed_addr  !dbg !4 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+  br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @_Z3barv(), !dbg !8
+  br label %if.end, !dbg !8
+
+if.end:                                           ; preds = %if.then, %entry
+  tail call void @_Z9calculatev(), !dbg !10
+  ret void, !dbg !11
+}
+
+define internal fastcc void @_Z3barv() unnamed_addr  !dbg !12 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !13
+  %cmp = icmp sgt i32 %tmp, 17, !dbg !13
+  br i1 %cmp, label %if.then, label %if.end, !dbg !13
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @_Z3foov(), !dbg !15
+  br label %if.end, !dbg !15
+
+if.end:                                           ; preds = %if.then, %entry
+  tail call void @_Z9calculatev(), !dbg !17
+  ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr
+
+; Function Attrs: alwaysinline
+define void @_Z6kernelv() #1 !dbg !19 {
+entry:
+  %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !20
+  %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !20
+  br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !20
+
+if.then.i:                                        ; preds = %entry
+  %cmp.i2 = icmp sgt i32 %tmp.i, 17, !dbg !23
+  br i1 %cmp.i2, label %if.then.i10, label %_Z3barv.exit, !dbg !23
+
+if.then.i10:                                      ; preds = %if.then.i
+  tail call fastcc void @_Z3foov(), !dbg !25
+  tail call void @_Z9calculatev(), !dbg !28
+  tail call void @_Z9calculatev(), !dbg !29
+  br label %_Z3barv.exit, !dbg !30
+
+_Z3barv.exit:                                     ; preds = %if.then.i, %if.then.i10
+  tail call void @_Z9calculatev(), !dbg !31
+  br label %_Z3foov.exit, !dbg !32
+
+_Z3foov.exit:                                     ; preds = %entry, %_Z3barv.exit
+  tail call void @_Z9calculatev(), !dbg !33
+  ret void, !dbg !34
+}
+
+attributes #1 = { alwaysinline }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t1.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 10, column: 5, scope: !9)
+!9 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!10 = !DILocation(line: 11, column: 3, scope: !7)
+!11 = !DILocation(line: 12, column: 1, scope: !7)
+!12 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!13 = !DILocation(line: 15, column: 3, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !12, file: !1, line: 14, column: 29)
+!15 = !DILocation(line: 16, column: 5, scope: !16)
+!16 = distinct !DILexicalBlock(scope: !14, file: !1, line: 15, column: 3)
+!17 = !DILocation(line: 17, column: 3, scope: !14)
+!18 = !DILocation(line: 18, column: 1, scope: !14)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!20 = !DILocation(line: 9, column: 3, scope: !7, inlinedAt: !21)
+!21 = distinct !DILocation(line: 21, column: 3, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!23 = !DILocation(line: 15, column: 3, scope: !14, inlinedAt: !24)
+!24 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!25 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !26)
+!26 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !27)
+!27 = distinct !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!28 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !26)
+!29 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !27)
+!30 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!31 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !24)
+!32 = !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!33 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !21)
+!34 = !DILocation(line: 22, column: 1, scope: !22)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
new file mode 100644
index 0000000000000..47e4655a3ef54
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test simple two-level inlining - verifies that inlined_at information is correctly
+;; emitted when foo() calls bar() and kernel() calls foo().
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+;
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+; }
+;
+; __device__ void bar() {
+;   ++gg;
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 18
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelv() #0 !dbg !5 {
+entry:
+  %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !7
+  %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !7
+  br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !7
+
+if.then.i:                                        ; preds = %entry
+  %inc.i.i = add nuw nsw i32 %tmp.i, 1, !dbg !12
+  store i32 %inc.i.i, ptr addrspace(1) @gg, align 4, !dbg !12
+  br label %_Z3foov.exit, !dbg !17
+
+_Z3foov.exit:                                     ; preds = %entry, %if.then.i
+  ret void, !dbg !18
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t2.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelv, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 9, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 8, column: 29)
+!9 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 18, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 17, column: 29)
+!12 = !DILocation(line: 14, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 13, column: 29)
+!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 13, type: !6, scopeLine: 13, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!16 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9, column: 3)
+!17 = !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!18 = !DILocation(line: 19, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
new file mode 100644
index 0000000000000..5b94a249ab64d
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test inlining of a C++ constructor with control flow - verifies that inlined_at
+;; information is correctly emitted when a constructor containing loops and conditionals is inlined.
+;
+; __device__ int gg;
+; __device__ int *arr;
+;
+; class C {
+;   int priv;
+;   public: __device__ C();
+;   __device__ C(int);
+;   __device__ int get() const;
+; };
+;
+;
+; __device__ C::C() : priv(1) {
+;   int sum = 0;
+;   for (int i = 0; i < gg; ++i) sum += arr[i];
+;   if (sum > 17)
+;     priv = sum;
+; }
+;
+; __device__ C::C(int n) : priv(n) {}
+;
+; __device__ int C::get() const { return priv; }
+;
+; __global__ void kernel(int n) {
+;   C c1;
+;   if (n > 7)
+;     gg = c1.get();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 24
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[CTORNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[CTORNAME]]:
+; CHECK-NEXT: // {{.*}} _ZN1CC1Ev
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at arr = internal addrspace(1) global ptr null, align 8
+ at llvm.used = appending global [3 x ptr] [ptr @_Z6kerneli, ptr addrspacecast (ptr addrspace(1) @arr to ptr), ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none)
+define void @_Z6kerneli(i32 noundef %n) #0 !dbg !4 {
+entry:
+  %tmp3.i1 = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp.i2 = icmp sgt i32 %tmp3.i1, 0, !dbg !6
+  br i1 %cmp.i2, label %for.body.i.preheader, label %for.end.i, !dbg !6
+
+for.body.i.preheader:                             ; preds = %entry
+  %tmp4.i = load ptr, ptr addrspace(1) @arr, align 8
+  %0 = addrspacecast ptr %tmp4.i to ptr addrspace(1)
+  %xtraiter = and i32 %tmp3.i1, 15, !dbg !6
+  %1 = icmp samesign ult i32 %tmp3.i1, 16, !dbg !6
+  br i1 %1, label %for.end.i.loopexit.unr-lcssa, label %for.body.i.preheader.new, !dbg !6
+
+for.body.i.preheader.new:                         ; preds = %for.body.i.preheader
+  %unroll_iter = and i32 %tmp3.i1, 2147483632, !dbg !6
+  br label %for.body.i, !dbg !6
+
+for.body.i:                                       ; preds = %for.body.i, %for.body.i.preheader.new
+  %i.0.i4 = phi i32 [ 0, %for.body.i.preheader.new ], [ %inc.i.15, %for.body.i ]
+  %sum.0.i3 = phi i32 [ 0, %for.body.i.preheader.new ], [ %add.i.15, %for.body.i ]
+  %niter = phi i32 [ 0, %for.body.i.preheader.new ], [ %niter.next.15, %for.body.i ]
+  %2 = zext nneg i32 %i.0.i4 to i64, !dbg !6
+  %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %2, !dbg !6
+  %tmp6.i = load i32, ptr addrspace(1) %getElem, align 4, !dbg !6
+  %add.i = add nsw i32 %tmp6.i, %sum.0.i3, !dbg !6
+  %inc.i = add nuw nsw i32 %i.0.i4, 1, !dbg !6
+  %3 = zext nneg i32 %inc.i to i64, !dbg !6
+  %getElem.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %3, !dbg !6
+  %tmp6.i.1 = load i32, ptr addrspace(1) %getElem.1, align 4, !dbg !6
+  %add.i.1 = add nsw i32 %tmp6.i.1, %add.i, !dbg !6
+  %inc.i.1 = add nuw nsw i32 %i.0.i4, 2, !dbg !6
+  %4 = zext nneg i32 %inc.i.1 to i64, !dbg !6
+  %getElem.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %4, !dbg !6
+  %tmp6.i.2 = load i32, ptr addrspace(1) %getElem.2, align 4, !dbg !6
+  %add.i.2 = add nsw i32 %tmp6.i.2, %add.i.1, !dbg !6
+  %inc.i.2 = add nuw nsw i32 %i.0.i4, 3, !dbg !6
+  %5 = zext nneg i32 %inc.i.2 to i64, !dbg !6
+  %getElem.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %5, !dbg !6
+  %tmp6.i.3 = load i32, ptr addrspace(1) %getElem.3, align 4, !dbg !6
+  %add.i.3 = add nsw i32 %tmp6.i.3, %add.i.2, !dbg !6
+  %inc.i.3 = add nuw nsw i32 %i.0.i4, 4, !dbg !6
+  %6 = zext nneg i32 %inc.i.3 to i64, !dbg !6
+  %getElem.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %6, !dbg !6
+  %tmp6.i.4 = load i32, ptr addrspace(1) %getElem.4, align 4, !dbg !6
+  %add.i.4 = add nsw i32 %tmp6.i.4, %add.i.3, !dbg !6
+  %inc.i.4 = add nuw nsw i32 %i.0.i4, 5, !dbg !6
+  %7 = zext nneg i32 %inc.i.4 to i64, !dbg !6
+  %getElem.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %7, !dbg !6
+  %tmp6.i.5 = load i32, ptr addrspace(1) %getElem.5, align 4, !dbg !6
+  %add.i.5 = add nsw i32 %tmp6.i.5, %add.i.4, !dbg !6
+  %inc.i.5 = add nuw nsw i32 %i.0.i4, 6, !dbg !6
+  %8 = zext nneg i32 %inc.i.5 to i64, !dbg !6
+  %getElem.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %8, !dbg !6
+  %tmp6.i.6 = load i32, ptr addrspace(1) %getElem.6, align 4, !dbg !6
+  %add.i.6 = add nsw i32 %tmp6.i.6, %add.i.5, !dbg !6
+  %inc.i.6 = add nuw nsw i32 %i.0.i4, 7, !dbg !6
+  %9 = zext nneg i32 %inc.i.6 to i64, !dbg !6
+  %getElem.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %9, !dbg !6
+  %tmp6.i.7 = load i32, ptr addrspace(1) %getElem.7, align 4, !dbg !6
+  %add.i.7 = add nsw i32 %tmp6.i.7, %add.i.6, !dbg !6
+  %inc.i.7 = add nuw nsw i32 %i.0.i4, 8, !dbg !6
+  %10 = zext nneg i32 %inc.i.7 to i64, !dbg !6
+  %getElem.8 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %10, !dbg !6
+  %tmp6.i.8 = load i32, ptr addrspace(1) %getElem.8, align 4, !dbg !6
+  %add.i.8 = add nsw i32 %tmp6.i.8, %add.i.7, !dbg !6
+  %inc.i.8 = add nuw nsw i32 %i.0.i4, 9, !dbg !6
+  %11 = zext nneg i32 %inc.i.8 to i64, !dbg !6
+  %getElem.9 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %11, !dbg !6
+  %tmp6.i.9 = load i32, ptr addrspace(1) %getElem.9, align 4, !dbg !6
+  %add.i.9 = add nsw i32 %tmp6.i.9, %add.i.8, !dbg !6
+  %inc.i.9 = add nuw nsw i32 %i.0.i4, 10, !dbg !6
+  %12 = zext nneg i32 %inc.i.9 to i64, !dbg !6
+  %getElem.10 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %12, !dbg !6
+  %tmp6.i.10 = load i32, ptr addrspace(1) %getElem.10, align 4, !dbg !6
+  %add.i.10 = add nsw i32 %tmp6.i.10, %add.i.9, !dbg !6
+  %inc.i.10 = add nuw nsw i32 %i.0.i4, 11, !dbg !6
+  %13 = zext nneg i32 %inc.i.10 to i64, !dbg !6
+  %getElem.11 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %13, !dbg !6
+  %tmp6.i.11 = load i32, ptr addrspace(1) %getElem.11, align 4, !dbg !6
+  %add.i.11 = add nsw i32 %tmp6.i.11, %add.i.10, !dbg !6
+  %inc.i.11 = add nuw nsw i32 %i.0.i4, 12, !dbg !6
+  %14 = zext nneg i32 %inc.i.11 to i64, !dbg !6
+  %getElem.12 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %14, !dbg !6
+  %tmp6.i.12 = load i32, ptr addrspace(1) %getElem.12, align 4, !dbg !6
+  %add.i.12 = add nsw i32 %tmp6.i.12, %add.i.11, !dbg !6
+  %inc.i.12 = add nuw nsw i32 %i.0.i4, 13, !dbg !6
+  %15 = zext nneg i32 %inc.i.12 to i64, !dbg !6
+  %getElem.13 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %15, !dbg !6
+  %tmp6.i.13 = load i32, ptr addrspace(1) %getElem.13, align 4, !dbg !6
+  %add.i.13 = add nsw i32 %tmp6.i.13, %add.i.12, !dbg !6
+  %inc.i.13 = add nuw nsw i32 %i.0.i4, 14, !dbg !6
+  %16 = zext nneg i32 %inc.i.13 to i64, !dbg !6
+  %getElem.14 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %16, !dbg !6
+  %tmp6.i.14 = load i32, ptr addrspace(1) %getElem.14, align 4, !dbg !6
+  %add.i.14 = add nsw i32 %tmp6.i.14, %add.i.13, !dbg !6
+  %inc.i.14 = add nuw nsw i32 %i.0.i4, 15, !dbg !6
+  %17 = zext nneg i32 %inc.i.14 to i64, !dbg !6
+  %getElem.15 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %17, !dbg !6
+  %tmp6.i.15 = load i32, ptr addrspace(1) %getElem.15, align 4, !dbg !6
+  %add.i.15 = add nsw i32 %tmp6.i.15, %add.i.14, !dbg !6
+  %inc.i.15 = add nuw nsw i32 %i.0.i4, 16, !dbg !6
+  %niter.next.15 = add i32 %niter, 16, !dbg !6
+  %niter.ncmp.15.not = icmp eq i32 %niter.next.15, %unroll_iter, !dbg !6
+  br i1 %niter.ncmp.15.not, label %for.end.i.loopexit.unr-lcssa, label %for.body.i, !dbg !6, !llvm.loop !11
+
+for.end.i.loopexit.unr-lcssa:                     ; preds = %for.body.i, %for.body.i.preheader
+  %add.i.lcssa.ph = phi i32 [ poison, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+  %i.0.i4.unr = phi i32 [ 0, %for.body.i.preheader ], [ %inc.i.15, %for.body.i ]
+  %sum.0.i3.unr = phi i32 [ 0, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+  %lcmp.mod.not = icmp eq i32 %xtraiter, 0, !dbg !6
+  br i1 %lcmp.mod.not, label %for.end.i, label %for.body.i.epil.preheader, !dbg !6
+
+for.body.i.epil.preheader:                        ; preds = %for.end.i.loopexit.unr-lcssa
+  %xtraiter6 = and i32 %tmp3.i1, 7, !dbg !6
+  %18 = icmp samesign ult i32 %xtraiter, 8, !dbg !6
+  br i1 %18, label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, label %for.body.i.epil, !dbg !6
+
+for.body.i.epil:                                  ; preds = %for.body.i.epil.preheader
+  %19 = zext nneg i32 %i.0.i4.unr to i64, !dbg !6
+  %getElem.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %19, !dbg !6
+  %tmp6.i.epil = load i32, ptr addrspace(1) %getElem.epil, align 4, !dbg !6
+  %add.i.epil = add nsw i32 %tmp6.i.epil, %sum.0.i3.unr, !dbg !6
+  %inc.i.epil = add nuw nsw i32 %i.0.i4.unr, 1, !dbg !6
+  %20 = zext nneg i32 %inc.i.epil to i64, !dbg !6
+  %getElem.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %20, !dbg !6
+  %tmp6.i.epil.1 = load i32, ptr addrspace(1) %getElem.epil.1, align 4, !dbg !6
+  %add.i.epil.1 = add nsw i32 %tmp6.i.epil.1, %add.i.epil, !dbg !6
+  %inc.i.epil.1 = add nuw nsw i32 %i.0.i4.unr, 2, !dbg !6
+  %21 = zext nneg i32 %inc.i.epil.1 to i64, !dbg !6
+  %getElem.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %21, !dbg !6
+  %tmp6.i.epil.2 = load i32, ptr addrspace(1) %getElem.epil.2, align 4, !dbg !6
+  %add.i.epil.2 = add nsw i32 %tmp6.i.epil.2, %add.i.epil.1, !dbg !6
+  %inc.i.epil.2 = add nuw nsw i32 %i.0.i4.unr, 3, !dbg !6
+  %22 = zext nneg i32 %inc.i.epil.2 to i64, !dbg !6
+  %getElem.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %22, !dbg !6
+  %tmp6.i.epil.3 = load i32, ptr addrspace(1) %getElem.epil.3, align 4, !dbg !6
+  %add.i.epil.3 = add nsw i32 %tmp6.i.epil.3, %add.i.epil.2, !dbg !6
+  %inc.i.epil.3 = add nuw nsw i32 %i.0.i4.unr, 4, !dbg !6
+  %23 = zext nneg i32 %inc.i.epil.3 to i64, !dbg !6
+  %getElem.epil.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %23, !dbg !6
+  %tmp6.i.epil.4 = load i32, ptr addrspace(1) %getElem.epil.4, align 4, !dbg !6
+  %add.i.epil.4 = add nsw i32 %tmp6.i.epil.4, %add.i.epil.3, !dbg !6
+  %inc.i.epil.4 = add nuw nsw i32 %i.0.i4.unr, 5, !dbg !6
+  %24 = zext nneg i32 %inc.i.epil.4 to i64, !dbg !6
+  %getElem.epil.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %24, !dbg !6
+  %tmp6.i.epil.5 = load i32, ptr addrspace(1) %getElem.epil.5, align 4, !dbg !6
+  %add.i.epil.5 = add nsw i32 %tmp6.i.epil.5, %add.i.epil.4, !dbg !6
+  %inc.i.epil.5 = add nuw nsw i32 %i.0.i4.unr, 6, !dbg !6
+  %25 = zext nneg i32 %inc.i.epil.5 to i64, !dbg !6
+  %getElem.epil.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %25, !dbg !6
+  %tmp6.i.epil.6 = load i32, ptr addrspace(1) %getElem.epil.6, align 4, !dbg !6
+  %add.i.epil.6 = add nsw i32 %tmp6.i.epil.6, %add.i.epil.5, !dbg !6
+  %inc.i.epil.6 = add nuw nsw i32 %i.0.i4.unr, 7, !dbg !6
+  %26 = zext nneg i32 %inc.i.epil.6 to i64, !dbg !6
+  %getElem.epil.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %26, !dbg !6
+  %tmp6.i.epil.7 = load i32, ptr addrspace(1) %getElem.epil.7, align 4, !dbg !6
+  %add.i.epil.7 = add nsw i32 %tmp6.i.epil.7, %add.i.epil.6, !dbg !6
+  %inc.i.epil.7 = add nuw nsw i32 %i.0.i4.unr, 8, !dbg !6
+  br label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.unr-lcssa:        ; preds = %for.body.i.epil, %for.body.i.epil.preheader
+  %add.i.lcssa.ph5.ph = phi i32 [ poison, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+  %i.0.i4.epil.unr = phi i32 [ %i.0.i4.unr, %for.body.i.epil.preheader ], [ %inc.i.epil.7, %for.body.i.epil ]
+  %sum.0.i3.epil.unr = phi i32 [ %sum.0.i3.unr, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+  %lcmp.mod8.not = icmp eq i32 %xtraiter6, 0, !dbg !6
+  br i1 %lcmp.mod8.not, label %for.end.i, label %for.body.i.epil.epil.preheader, !dbg !6
+
+for.body.i.epil.epil.preheader:                   ; preds = %for.end.i.loopexit.epilog-lcssa.unr-lcssa
+  %xtraiter12 = and i32 %tmp3.i1, 3, !dbg !6
+  %27 = icmp samesign ult i32 %xtraiter6, 4, !dbg !6
+  br i1 %27, label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, label %for.body.i.epil.epil, !dbg !6
+
+for.body.i.epil.epil:                             ; preds = %for.body.i.epil.epil.preheader
+  %28 = zext nneg i32 %i.0.i4.epil.unr to i64, !dbg !6
+  %getElem.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %28, !dbg !6
+  %tmp6.i.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil, align 4, !dbg !6
+  %add.i.epil.epil = add nsw i32 %tmp6.i.epil.epil, %sum.0.i3.epil.unr, !dbg !6
+  %inc.i.epil.epil = add nuw nsw i32 %i.0.i4.epil.unr, 1, !dbg !6
+  %29 = zext nneg i32 %inc.i.epil.epil to i64, !dbg !6
+  %getElem.epil.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %29, !dbg !6
+  %tmp6.i.epil.epil.1 = load i32, ptr addrspace(1) %getElem.epil.epil.1, align 4, !dbg !6
+  %add.i.epil.epil.1 = add nsw i32 %tmp6.i.epil.epil.1, %add.i.epil.epil, !dbg !6
+  %inc.i.epil.epil.1 = add nuw nsw i32 %i.0.i4.epil.unr, 2, !dbg !6
+  %30 = zext nneg i32 %inc.i.epil.epil.1 to i64, !dbg !6
+  %getElem.epil.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %30, !dbg !6
+  %tmp6.i.epil.epil.2 = load i32, ptr addrspace(1) %getElem.epil.epil.2, align 4, !dbg !6
+  %add.i.epil.epil.2 = add nsw i32 %tmp6.i.epil.epil.2, %add.i.epil.epil.1, !dbg !6
+  %inc.i.epil.epil.2 = add nuw nsw i32 %i.0.i4.epil.unr, 3, !dbg !6
+  %31 = zext nneg i32 %inc.i.epil.epil.2 to i64, !dbg !6
+  %getElem.epil.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %31, !dbg !6
+  %tmp6.i.epil.epil.3 = load i32, ptr addrspace(1) %getElem.epil.epil.3, align 4, !dbg !6
+  %add.i.epil.epil.3 = add nsw i32 %tmp6.i.epil.epil.3, %add.i.epil.epil.2, !dbg !6
+  %inc.i.epil.epil.3 = add nuw nsw i32 %i.0.i4.epil.unr, 4, !dbg !6
+  br label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa: ; preds = %for.body.i.epil.epil, %for.body.i.epil.epil.preheader
+  %add.i.lcssa.ph5.ph9.ph = phi i32 [ poison, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+  %i.0.i4.epil.epil.unr = phi i32 [ %i.0.i4.epil.unr, %for.body.i.epil.epil.preheader ], [ %inc.i.epil.epil.3, %for.body.i.epil.epil ]
+  %sum.0.i3.epil.epil.unr = phi i32 [ %sum.0.i3.epil.unr, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+  %lcmp.mod14.not = icmp eq i32 %xtraiter12, 0, !dbg !6
+  br i1 %lcmp.mod14.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6
+
+for.body.i.epil.epil.epil:                        ; preds = %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.body.i.epil.epil.epil
+  %i.0.i4.epil.epil.epil = phi i32 [ %inc.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %i.0.i4.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %sum.0.i3.epil.epil.epil = phi i32 [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %sum.0.i3.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %epil.iter13 = phi i32 [ %epil.iter13.next, %for.body.i.epil.epil.epil ], [ 0, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %32 = zext nneg i32 %i.0.i4.epil.epil.epil to i64, !dbg !6
+  %getElem.epil.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %32, !dbg !6
+  %tmp6.i.epil.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil.epil, align 4, !dbg !6
+  %add.i.epil.epil.epil = add nsw i32 %tmp6.i.epil.epil.epil, %sum.0.i3.epil.epil.epil, !dbg !6
+  %inc.i.epil.epil.epil = add nuw nsw i32 %i.0.i4.epil.epil.epil, 1, !dbg !6
+  %epil.iter13.next = add i32 %epil.iter13, 1, !dbg !6
+  %epil.iter13.cmp.not = icmp eq i32 %epil.iter13.next, %xtraiter12, !dbg !6
+  br i1 %epil.iter13.cmp.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6, !llvm.loop !14
+
+for.end.i:                                        ; preds = %for.body.i.epil.epil.epil, %for.end.i.loopexit.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.unr-lcssa, %entry
+  %sum.0.i.lcssa = phi i32 [ 0, %entry ], [ %add.i.lcssa.ph, %for.end.i.loopexit.unr-lcssa ], [ %add.i.lcssa.ph5.ph, %for.end.i.loopexit.epilog-lcssa.unr-lcssa ], [ %add.i.lcssa.ph5.ph9.ph, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ], [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], !dbg !6
+  %cmp10.i = icmp sgt i32 %sum.0.i.lcssa, 17, !dbg !15
+  %spec.select = select i1 %cmp10.i, i32 %sum.0.i.lcssa, i32 1, !dbg !15
+  %cmp = icmp sgt i32 %n, 7, !dbg !16
+  br i1 %cmp, label %if.then, label %if.end, !dbg !16
+
+if.then:                                          ; preds = %for.end.i
+  store i32 %spec.select, ptr addrspace(1) @gg, align 4, !dbg !17
+  br label %if.end, !dbg !17
+
+if.end:                                           ; preds = %if.then, %for.end.i
+  ret void, !dbg !19
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t3.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kerneli", scope: !1, file: !1, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 14, column: 3, scope: !7, inlinedAt: !9)
+!7 = distinct !DILexicalBlock(scope: !8, file: !1, line: 12, column: 27)
+!8 = distinct !DISubprogram(name: "C", linkageName: "_ZN1CC1Ev", scope: !1, file: !1, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!9 = distinct !DILocation(line: 24, column: 3, scope: !10)
+!10 = distinct !DILexicalBlock(scope: !4, file: !1, line: 23, column: 29)
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.mustprogress"}
+!13 = !{!"llvm.loop.unroll.disable"}
+!14 = distinct !{!14, !13}
+!15 = !DILocation(line: 15, column: 3, scope: !7, inlinedAt: !9)
+!16 = !DILocation(line: 25, column: 3, scope: !10)
+!17 = !DILocation(line: 26, column: 5, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !10, file: !1, line: 25, column: 3)
+!19 = !DILocation(line: 27, column: 1, scope: !10)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
new file mode 100644
index 0000000000000..57df39e8034c4
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
@@ -0,0 +1,229 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test multiple inline calls at the same level - verifies that inlined_at information
+;; is correctly emitted when multiple different functions (or multiple copies of the same function) are inlined into a single caller.
+;
+; __device__ __forceinline__ int foo(int a)
+; {
+;   if (a > 7)
+;     return a*a;
+;   return ++a;
+; }
+;
+; __device__ __forceinline__ int baz(int a)
+; {
+;   if (a > 23)
+;     return a*2;
+;   return ++a;
+; }
+;
+; __device__ int bar(int i, int j)
+; {
+;   return i + j;
+; }
+;
+; __device__ int d;
+;
+; // inlining two different functions
+; __global__ void kernel1(int x, int y)
+; {
+;   d = bar(foo(x), baz(y));
+; }
+;
+; // inlining two copies of same function
+; __global__ void kernel2(int x, int y)
+; {
+;   d = bar(foo(x), foo(y));
+; }
+;
+; // inlining two different functions, extra computation in caller (y+x)
+; __global__ void kernel3(int x, int y)
+; {
+;   d = bar(foo(x), baz(y + x));
+; }
+;
+; // inlining two copies of same function, extra computation in caller (y+x)
+; __global__ void kernel4(int x, int y)
+; {
+;   d = bar(foo(x), foo(y + x));
+; }
+;
+; CHECK: .entry _Z7kernel1ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+;
+; CHECK: .entry _Z7kernel2ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 31
+;
+; CHECK: .entry _Z7kernel3ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 37
+;
+; CHECK: .entry _Z7kernel4ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 43
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3fooi
+; CHECK: [[BAZNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3bazi
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at d = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [5 x ptr] [ptr @_Z7kernel1ii, ptr @_Z7kernel2ii, ptr @_Z7kernel3ii, ptr @_Z7kernel4ii, ptr addrspacecast (ptr addrspace(1) @d to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel1ii(i32 noundef %x, i32 noundef %y) #0 !dbg !11 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !13
+  %mul.i = mul nsw i32 %x, %x, !dbg !13
+  %inc.i = add nsw i32 %x, 1, !dbg !13
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !13
+  %cmp.i1 = icmp sgt i32 %y, 23, !dbg !18
+  %mul.i2 = shl nuw nsw i32 %y, 1, !dbg !18
+  %inc.i3 = add nsw i32 %y, 1, !dbg !18
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !18
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !22
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel2ii(i32 noundef %x, i32 noundef %y) #0 !dbg !28 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !29
+  %mul.i = mul nsw i32 %x, %x, !dbg !29
+  %inc.i = add nsw i32 %x, 1, !dbg !29
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !29
+  %cmp.i1 = icmp sgt i32 %y, 7, !dbg !32
+  %mul.i2 = mul nsw i32 %y, %y, !dbg !32
+  %inc.i3 = add nsw i32 %y, 1, !dbg !32
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !32
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !34
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !36
+  ret void, !dbg !37
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel3ii(i32 noundef %x, i32 noundef %y) #0 !dbg !38 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !39
+  %mul.i = mul nsw i32 %x, %x, !dbg !39
+  %inc.i = add nsw i32 %x, 1, !dbg !39
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !39
+  %add = add nsw i32 %y, %x, !dbg !42
+  %cmp.i1 = icmp sgt i32 %add, 23, !dbg !43
+  %mul.i2 = shl nuw nsw i32 %add, 1, !dbg !43
+  %inc.i3 = add nsw i32 %add, 1, !dbg !43
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !43
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !45
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !42
+  ret void, !dbg !47
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel4ii(i32 noundef %x, i32 noundef %y) #0 !dbg !48 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !49
+  %mul.i = mul nsw i32 %x, %x, !dbg !49
+  %inc.i = add nsw i32 %x, 1, !dbg !49
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !49
+  %add = add nsw i32 %y, %x, !dbg !52
+  %cmp.i1 = icmp sgt i32 %add, 7, !dbg !53
+  %mul.i2 = mul nsw i32 %add, %add, !dbg !53
+  %inc.i3 = add nsw i32 %add, 1, !dbg !53
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !53
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !55
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !52
+  ret void, !dbg !57
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvmir.version = !{!2, !3, !4}
+!nvvm.annotations = !{!5, !6, !7, !8}
+!llvm.module.flags = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t4.cu", directory: "")
+!2 = !{i32 2, i32 0, i32 3, i32 2}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, i32 0, i32 3, i32 1}
+!5 = !{ptr @_Z7kernel1ii, !"kernel", i32 1}
+!6 = !{ptr @_Z7kernel2ii, !"kernel", i32 1}
+!7 = !{ptr @_Z7kernel3ii, !"kernel", i32 1}
+!8 = !{ptr @_Z7kernel4ii, !"kernel", i32 1}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{}
+!11 = distinct !DISubprogram(name: "kernel1", linkageName: "_Z7kernel1ii", scope: !1, file: !1, line: 23, type: !12, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!12 = !DISubroutineType(types: !10)
+!13 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlock(scope: !15, file: !1, line: 2, column: 1)
+!15 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !11, file: !1, line: 24, column: 1)
+!18 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 9, column: 1)
+!20 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazi", scope: !1, file: !1, line: 8, type: !12, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!22 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 16, column: 1)
+!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", scope: !1, file: !1, line: 15, type: !12, scopeLine: 15, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!26 = !DILocation(line: 25, column: 3, scope: !17)
+!27 = !DILocation(line: 26, column: 1, scope: !17)
+!28 = distinct !DISubprogram(name: "kernel2", linkageName: "_Z7kernel2ii", scope: !1, file: !1, line: 29, type: !12, scopeLine: 29, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!29 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !30)
+!30 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 30, column: 1)
+!32 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !33)
+!33 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!34 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !35)
+!35 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!36 = !DILocation(line: 31, column: 3, scope: !31)
+!37 = !DILocation(line: 32, column: 1, scope: !31)
+!38 = distinct !DISubprogram(name: "kernel3", linkageName: "_Z7kernel3ii", scope: !1, file: !1, line: 35, type: !12, scopeLine: 35, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!39 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !40)
+!40 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!41 = distinct !DILexicalBlock(scope: !38, file: !1, line: 36, column: 1)
+!42 = !DILocation(line: 37, column: 3, scope: !41)
+!43 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !44)
+!44 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!45 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !46)
+!46 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!47 = !DILocation(line: 38, column: 1, scope: !41)
+!48 = distinct !DISubprogram(name: "kernel4", linkageName: "_Z7kernel4ii", scope: !1, file: !1, line: 41, type: !12, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!49 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !50)
+!50 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !48, file: !1, line: 42, column: 1)
+!52 = !DILocation(line: 43, column: 3, scope: !51)
+!53 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !54)
+!54 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!55 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !56)
+!56 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!57 = !DILocation(line: 44, column: 1, scope: !51)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
new file mode 100644
index 0000000000000..dc242a95f54d8
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test deep inline chain - verifies that inlined_at information is correctly emitted
+;; through an 11-level deep chain of inlining (foo0 through foo10).
+;
+; __device__ int foo0(int a, int b) {
+;   if (a > b)
+;     return --a;
+;   if (a > b - 7)
+;     return a*2;
+;   return ++a;
+; }
+; __device__ int foo1(int a, int b) {
+;   return foo0(a*3, b*b);
+; }
+; __device__ int foo2(int a, int b) {
+;   return foo1(a+2, b*b);
+; }
+; __device__ int foo3(int a, int b) {
+;   return foo2(a+100, b-7);
+; }
+; __device__ int foo4(int a, int b) {
+;   return foo3(a*a, b*3);
+; }
+; __device__ int foo5(int a, int b) {
+;   return foo4(a*3, b*3);
+; }
+; __device__ int foo6(int a, int b) {
+;   return foo5(a*3, b*3);
+; }
+; __device__ int foo7(int a, int b) {
+;   return foo6(a*a + 2, b*b*5);
+; }
+; __device__ int foo8(int a, int b) {
+;   return foo7(a*2, b*2*a);
+; }
+; __device__ int foo9(int a, int b) {
+;   return foo8(a*2, b*2*a);
+; }
+; __device__ int foo10(int a, int b) {
+;   return foo9(a*2*b, b*2);
+; }
+;
+; __device__ int g;
+;
+; __global__ void kernel(int a, int b) {
+;   g = foo10(a, b);
+; }
+;
+; CHECK: .entry _Z6kernelii(
+; CHECK: .loc [[FILENUM:[1-9]]] 42
+; CHECK: .loc [[FILENUM]] 36 {{[0-9]*}}, function_name [[FOO10NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 42
+; CHECK: .loc [[FILENUM]] 33 {{[0-9]*}}, function_name [[FOO9NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 36
+; CHECK: .loc [[FILENUM]] 30 {{[0-9]*}}, function_name [[FOO8NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 33
+; CHECK: .loc [[FILENUM]] 27 {{[0-9]*}}, function_name [[FOO7NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 30
+; CHECK: .loc [[FILENUM]] 24 {{[0-9]*}}, function_name [[FOO6NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 27
+; CHECK: .loc [[FILENUM]] 21 {{[0-9]*}}, function_name [[FOO5NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .loc [[FILENUM]] 18 {{[0-9]*}}, function_name [[FOO4NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[FOO3NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 12 {{[0-9]*}}, function_name [[FOO2NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 15
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOO1NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 12
+; CHECK: .loc [[FILENUM]] 2 {{[0-9]*}}, function_name [[FOO0NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 9
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOO0NAME]], inlined_at [[FILENUM]] 9
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOO10NAME]]:
+; CHECK-NEXT: // {{.*}} _Z5foo10ii
+; CHECK: [[FOO9NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo9ii
+; CHECK: [[FOO8NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo8ii
+; CHECK: [[FOO7NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo7ii
+; CHECK: [[FOO6NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo6ii
+; CHECK: [[FOO5NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo5ii
+; CHECK: [[FOO4NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo4ii
+; CHECK: [[FOO3NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo3ii
+; CHECK: [[FOO2NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo2ii
+; CHECK: [[FOO1NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo1ii
+; CHECK: [[FOO0NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo0ii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at g = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelii, ptr addrspacecast (ptr addrspace(1) @g to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelii(i32 noundef %a, i32 noundef %b) #0 !dbg !5 {
+entry:
+  %mul.i = shl nsw i32 %a, 1, !dbg !7
+  %mul2.i = mul nsw i32 %mul.i, %b, !dbg !7
+  %mul.i.i = shl nsw i32 %mul2.i, 1, !dbg !12
+  %mul.i.i.i = shl nsw i32 %mul2.i, 2, !dbg !16
+  %mul4.i.i = shl i32 %b, 3, !dbg !12
+  %mul2.i.i.i = mul i32 %mul4.i.i, %mul2.i, !dbg !16
+  %mul4.i.i.i = mul nsw i32 %mul2.i.i.i, %mul.i.i, !dbg !16
+  %mul.i.i.i.i = mul i32 %mul2.i, 36, !dbg !20
+  %0 = mul i32 %mul.i.i.i.i, %mul.i.i.i, !dbg !24
+  %mul.i.i.i.i.i.i = add nuw nsw i32 %0, 18, !dbg !24
+  %mul4.i.i.i.i = mul i32 %mul4.i.i.i, 135, !dbg !20
+  %mul3.i.i.i.i.i.i.i = mul i32 %mul4.i.i.i.i, %mul4.i.i.i, !dbg !31
+  %sub.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i, -7, !dbg !35
+  %mul.i.i.i.i.i.i.i.i.i = mul nsw i32 %sub.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i, !dbg !39
+  %mul.i.i.i.i.i.i.i = mul i32 %mul.i.i.i.i.i.i, 3, !dbg !31
+  %1 = mul i32 %mul.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i, !dbg !43
+  %mul.i.i.i.i.i.i.i.i.i.i = add i32 %1, 306, !dbg !43
+  %mul3.i.i.i.i.i.i.i.i.i.i = mul nuw nsw i32 %mul.i.i.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i.i.i.i, !dbg !43
+  %cmp.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %mul3.i.i.i.i.i.i.i.i.i.i, !dbg !47
+  br i1 %cmp.i.i.i.i.i.i.i.i.i.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i.i, label %if.end.i.i.i.i.i.i.i.i.i.i.i, !dbg !47
+
+if.then.i.i.i.i.i.i.i.i.i.i.i:                    ; preds = %entry
+  %dec.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 305, !dbg !51
+  br label %_Z5foo10ii.exit, !dbg !51
+
+if.end.i.i.i.i.i.i.i.i.i.i.i:                     ; preds = %entry
+  %sub.i.i.i.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i.i.i.i, -7, !dbg !53
+  %cmp5.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+  br i1 %cmp5.i.i.i.i.i.i.i.i.i.i.i, label %if.then6.i.i.i.i.i.i.i.i.i.i.i, label %if.end8.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+
+if.then6.i.i.i.i.i.i.i.i.i.i.i:                   ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+  %mul.i.i.i.i.i.i.i.i.i.i.i = shl nsw i32 %mul.i.i.i.i.i.i.i.i.i.i, 1, !dbg !54
+  br label %_Z5foo10ii.exit, !dbg !54
+
+if.end8.i.i.i.i.i.i.i.i.i.i.i:                    ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+  %inc.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 307, !dbg !56
+  br label %_Z5foo10ii.exit, !dbg !56
+
+_Z5foo10ii.exit:                                  ; preds = %if.then.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i
+  %retval.0.i.i.i.i.i.i.i.i.i.i.i = phi i32 [ %dec.i.i.i.i.i.i.i.i.i.i.i, %if.then.i.i.i.i.i.i.i.i.i.i.i ], [ %mul.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i ], [ %inc.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i ], !dbg !56
+  store i32 %retval.0.i.i.i.i.i.i.i.i.i.i.i, ptr addrspace(1) @g, align 4, !dbg !57
+  ret void, !dbg !58
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "t5.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelii, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelii", scope: !1, file: !1, line: 41, type: !6, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 35, column: 28)
+!9 = distinct !DISubprogram(name: "foo10", linkageName: "_Z5foo10ii", scope: !1, file: !1, line: 35, type: !6, scopeLine: 35, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 42, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 41, column: 29)
+!12 = !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 32, column: 28)
+!14 = distinct !DISubprogram(name: "foo9", linkageName: "_Z4foo9ii", scope: !1, file: !1, line: 32, type: !6, scopeLine: 32, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!16 = !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 29, column: 28)
+!18 = distinct !DISubprogram(name: "foo8", linkageName: "_Z4foo8ii", scope: !1, file: !1, line: 29, type: !6, scopeLine: 29, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!19 = distinct !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!20 = !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 26, column: 28)
+!22 = distinct !DISubprogram(name: "foo7", linkageName: "_Z4foo7ii", scope: !1, file: !1, line: 26, type: !6, scopeLine: 26, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!23 = distinct !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!24 = !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 20, column: 28)
+!26 = distinct !DISubprogram(name: "foo5", linkageName: "_Z4foo5ii", scope: !1, file: !1, line: 20, type: !6, scopeLine: 20, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 24, column: 3, scope: !28, inlinedAt: !30)
+!28 = distinct !DILexicalBlock(scope: !29, file: !1, line: 23, column: 28)
+!29 = distinct !DISubprogram(name: "foo6", linkageName: "_Z4foo6ii", scope: !1, file: !1, line: 23, type: !6, scopeLine: 23, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!30 = distinct !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!31 = !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!32 = distinct !DILexicalBlock(scope: !33, file: !1, line: 17, column: 28)
+!33 = distinct !DISubprogram(name: "foo4", linkageName: "_Z4foo4ii", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!34 = distinct !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!35 = !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!36 = distinct !DILexicalBlock(scope: !37, file: !1, line: 14, column: 28)
+!37 = distinct !DISubprogram(name: "foo3", linkageName: "_Z4foo3ii", scope: !1, file: !1, line: 14, type: !6, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!38 = distinct !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!39 = !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!40 = distinct !DILexicalBlock(scope: !41, file: !1, line: 11, column: 28)
+!41 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2ii", scope: !1, file: !1, line: 11, type: !6, scopeLine: 11, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!42 = distinct !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!43 = !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!44 = distinct !DILexicalBlock(scope: !45, file: !1, line: 8, column: 28)
+!45 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ii", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!46 = distinct !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!47 = !DILocation(line: 2, column: 3, scope: !48, inlinedAt: !50)
+!48 = distinct !DILexicalBlock(scope: !49, file: !1, line: 1, column: 28)
+!49 = distinct !DISubprogram(name: "foo0", linkageName: "_Z4foo0ii", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!50 = distinct !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!51 = !DILocation(line: 3, column: 5, scope: !52, inlinedAt: !50)
+!52 = distinct !DILexicalBlock(scope: !48, file: !1, line: 2, column: 3)
+!53 = !DILocation(line: 4, column: 3, scope: !48, inlinedAt: !50)
+!54 = !DILocation(line: 5, column: 5, scope: !55, inlinedAt: !50)
+!55 = distinct !DILexicalBlock(scope: !48, file: !1, line: 4, column: 3)
+!56 = !DILocation(line: 6, column: 3, scope: !48, inlinedAt: !50)
+!57 = !DILocation(line: 42, column: 3, scope: !11)
+!58 = !DILocation(line: 43, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
new file mode 100644
index 0000000000000..8c5e48814c59b
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
@@ -0,0 +1,343 @@
+; RUN: llc < %s -mattr=+ptx72 -O0 | FileCheck %s
+;
+;; Test same function inlined multiple times from different call sites - verifies that
+;; inlined_at information correctly distinguishes multiple inline instances of rrand() and znew() called from different locations within a loop.
+;
+; typedef unsigned long long MYSIZE_T; // avoid platform dependence
+; struct GridOpt {
+;     int launch;
+;     int sync;
+;     int size;
+;     MYSIZE_T itr;
+; };
+;
+; enum LaunchOpt {
+;     LaunchOptPerThreadStream = 0,
+;     LaunchOptBlockSharedStream,
+;     LaunchOptAllNullStream,
+;     LaunchOptForkBomb,
+;     LaunchOptLoopLaunch,
+;     LaunchOptRand,
+;     LaunchOptSize,
+; };
+;
+; // post launch sync options
+; enum SyncOpt {
+;     SyncOptAllThreadsSync = 0,
+;     SyncOptOneSyncPerBlock,
+;     SyncOptRand,
+;     SyncOptSize,
+; };
+;
+; // size of launch options
+; enum SizeOpt {
+;     SizeOptSingleWarp = 0,
+;     SizeOptMultiWarp,
+;     SizeOptMultiBlock,
+;     SizeOptRand,
+;     SizeOptSize,
+; };
+;
+;
+; // device side failure codes
+; enum ErrorStatus {
+;     Success = 0,
+;     DeviceRuntimeFailure  = 1,
+;     DeviceMallocFailure   = 2,
+;     DeviceHardwareFailure = 3,
+;     InvalidInput          = 4,
+; };
+;
+; __device__ MYSIZE_T znew( MYSIZE_T seed )
+; {
+;     return 36969 * ( seed & 65535 ) + seed >> 16;
+; }
+;
+; __device__ MYSIZE_T rrand( MYSIZE_T *seed )
+; {
+;     *seed = znew( *seed );
+;     return *seed;
+; }
+;
+; __device__ GridOpt loopOpt;
+;
+; __global__ void cnpWideLaunch(GridOpt opt, MYSIZE_T maxLaunches, MYSIZE_T randomSeed, int *status, int *launchCounts)
+; {
+;     // MYSIZE_T threadSeed = randomSeed + blockDim.x * blockIdx.x + threadIdx.x;
+;     MYSIZE_T blockSeed  = randomSeed + blockIdx.x;
+;
+;     // this device launch consumes a launch slot
+;     maxLaunches--;
+;
+;     // compute number of launches per block
+;     MYSIZE_T blockLaunches = maxLaunches  / gridDim.x;
+;     MYSIZE_T extraLaunches = maxLaunches - gridDim.x * blockLaunches;
+;     if (blockIdx.x < extraLaunches) {
+;         blockLaunches++;
+;     }
+;
+;     // clear launchcount with a thread in each block
+;     if (threadIdx.x == 0)
+;         launchCounts[blockIdx.x] = 0;
+;
+;     // compute per block random selections for sync/launch size/stream
+;     for (MYSIZE_T i = 0; i < opt.itr; i++) {
+;         loopOpt = opt;
+;         if (opt.launch == LaunchOptRand) {
+;             loopOpt.launch = rrand(&blockSeed) % (LaunchOptSize - 1);
+;         }
+;         if (opt.sync == SyncOptRand) {
+;             loopOpt.sync = rrand(&blockSeed) % (SyncOptSize - 1);
+;         }
+;         if (opt.size == SizeOptRand) {
+;             loopOpt.size = rrand(&blockSeed) % (SizeOptSize - 1);
+;         }
+;         __syncthreads();
+;         if (threadIdx.x == 0) {
+;             //printf("block %d launchCount %d blockLaunches:%d\n", blockIdx.x, launchCounts[blockIdx.x], blockLaunches);
+;             int launchCount = launchCounts[blockIdx.x];
+;
+;             // fail if block did not generate enough launches
+;             if (!*status && (launchCount != blockLaunches)) {
+;                 *status = DeviceHardwareFailure;
+;             }
+;             // clear last iteration's launch
+;             launchCounts[blockIdx.x] = 0;
+;         }
+;         __syncthreads();
+;
+;         // fail if a global error is present
+;         if (*status) {
+;             return;
+;         }
+;     }
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 84
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 84
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 87
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 87
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 90
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 90
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[RRANDNAME]]:
+; CHECK-NEXT: // {{.*}} _Z5rrandPy
+; CHECK: [[ZNEWNAME]]:
+; CHECK-NEXT: // {{.*}} _Z4znewy
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.GridOpt = type { i32, i32, i32, i64 }
+
+ at loopOpt = internal addrspace(1) global %struct.GridOpt zeroinitializer, align 8
+ at llvm.used = appending global [2 x ptr] [ptr @_Z13cnpWideLaunch7GridOptyyPiS0_, ptr addrspacecast (ptr addrspace(1) @loopOpt to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline convergent mustprogress norecurse nounwind willreturn
+define void @_Z13cnpWideLaunch7GridOptyyPiS0_(%struct.GridOpt noundef %opt, i64 noundef %maxLaunches, i64 noundef %randomSeed, ptr noundef captures(none) %status, ptr noundef captures(none) %launchCounts) #0 !dbg !4 {
+entry:
+  %0 = addrspacecast ptr %status to ptr addrspace(1)
+  %1 = addrspacecast ptr %launchCounts to ptr addrspace(1)
+  %opt.fca.0.extract = extractvalue %struct.GridOpt %opt, 0
+  %opt.fca.1.extract = extractvalue %struct.GridOpt %opt, 1
+  %opt.fca.2.extract = extractvalue %struct.GridOpt %opt, 2
+  %opt.fca.3.extract = extractvalue %struct.GridOpt %opt, 3
+  %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !6
+  %conv = zext nneg i32 %2 to i64, !dbg !6
+  %add = add i64 %randomSeed, %conv, !dbg !6
+  %dec = add i64 %maxLaunches, -1, !dbg !8
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !dbg !9
+  %conv6 = zext nneg i32 %3 to i64, !dbg !9
+  %div = udiv i64 %dec, %conv6, !dbg !9
+  %mul = mul i64 %div, %conv6, !dbg !10
+  %sub = sub i64 %dec, %mul, !dbg !10
+  %cmp = icmp ugt i64 %sub, %conv, !dbg !11
+  %inc = zext i1 %cmp to i64, !dbg !11
+  %spec.select = add i64 %div, %inc, !dbg !11
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12
+  %cmp20 = icmp eq i32 %4, 0, !dbg !12
+  br i1 %cmp20, label %if.then22, label %if.end26, !dbg !12
+
+if.then22:                                        ; preds = %entry
+  %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv, !dbg !13
+  store i32 0, ptr addrspace(1) %getElem, align 4, !dbg !13
+  br label %if.end26, !dbg !13
+
+if.end26:                                         ; preds = %if.then22, %entry
+  %cmp3021.not = icmp eq i64 %opt.fca.3.extract, 0, !dbg !15
+  br i1 %cmp3021.not, label %return, label %for.body.preheader, !dbg !15
+
+for.body.preheader:                               ; preds = %if.end26
+  %cmp34 = icmp eq i32 %opt.fca.0.extract, 5
+  %cmp41 = icmp eq i32 %opt.fca.1.extract, 2
+  %cmp50 = icmp eq i32 %opt.fca.2.extract, 3
+  %getElem11 = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv
+  br label %for.body, !dbg !16
+
+for.body:                                         ; preds = %for.body.preheader, %if.end83
+  %i.023 = phi i64 [ %inc90, %if.end83 ], [ 0, %for.body.preheader ]
+  %blockSeed.022 = phi i64 [ %blockSeed.3, %if.end83 ], [ %add, %for.body.preheader ]
+  %opt.elt = extractvalue %struct.GridOpt %opt, 0, !dbg !20
+  %opt.elt25 = extractvalue %struct.GridOpt %opt, 1, !dbg !20
+  %5 = insertelement <2 x i32> poison, i32 %opt.elt, i64 0, !dbg !20
+  %6 = insertelement <2 x i32> %5, i32 %opt.elt25, i64 1, !dbg !20
+  store <2 x i32> %6, ptr addrspace(1) @loopOpt, align 8, !dbg !20
+  %opt.elt26 = extractvalue %struct.GridOpt %opt, 2, !dbg !20
+  store i32 %opt.elt26, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !20
+  %opt.elt27 = extractvalue %struct.GridOpt %opt, 3, !dbg !20
+  store i64 %opt.elt27, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 16), align 8, !dbg !20
+  br i1 %cmp34, label %if.then36, label %if.end38, !dbg !16
+
+if.then36:                                        ; preds = %for.body
+  %and.i = and i64 %blockSeed.022, 65535, !dbg !21
+  %mul.i = mul nuw nsw i64 %and.i, 36969, !dbg !21
+  %add.i = add i64 %mul.i, %blockSeed.022, !dbg !21
+  %shr.i = lshr i64 %add.i, 16, !dbg !21
+  %rem = urem i64 %shr.i, 5, !dbg !29
+  %conv37 = trunc nuw nsw i64 %rem to i32, !dbg !29
+  store i32 %conv37, ptr addrspace(1) @loopOpt, align 8, !dbg !29
+  br label %if.end38, !dbg !29
+
+if.end38:                                         ; preds = %if.then36, %for.body
+  %blockSeed.1 = phi i64 [ %shr.i, %if.then36 ], [ %blockSeed.022, %for.body ], !dbg !30
+  br i1 %cmp41, label %if.then43, label %if.end47, !dbg !30
+
+if.then43:                                        ; preds = %if.end38
+  %and.i13 = and i64 %blockSeed.1, 65535, !dbg !31
+  %mul.i14 = mul nuw nsw i64 %and.i13, 36969, !dbg !31
+  %add.i15 = add i64 %mul.i14, %blockSeed.1, !dbg !31
+  %shr.i16 = lshr i64 %add.i15, 16, !dbg !31
+  %7 = trunc i64 %shr.i16 to i32, !dbg !35
+  %conv46 = and i32 %7, 1, !dbg !35
+  store i32 %conv46, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 4), align 4, !dbg !35
+  br label %if.end47, !dbg !35
+
+if.end47:                                         ; preds = %if.then43, %if.end38
+  %blockSeed.2 = phi i64 [ %shr.i16, %if.then43 ], [ %blockSeed.1, %if.end38 ], !dbg !36
+  br i1 %cmp50, label %if.then52, label %if.end56, !dbg !36
+
+if.then52:                                        ; preds = %if.end47
+  %and.i17 = and i64 %blockSeed.2, 65535, !dbg !37
+  %mul.i18 = mul nuw nsw i64 %and.i17, 36969, !dbg !37
+  %add.i19 = add i64 %mul.i18, %blockSeed.2, !dbg !37
+  %shr.i20 = lshr i64 %add.i19, 16, !dbg !37
+  %rem54 = urem i64 %shr.i20, 3, !dbg !41
+  %conv55 = trunc nuw nsw i64 %rem54 to i32, !dbg !41
+  store i32 %conv55, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !41
+  br label %if.end56, !dbg !41
+
+if.end56:                                         ; preds = %if.then52, %if.end47
+  %blockSeed.3 = phi i64 [ %shr.i20, %if.then52 ], [ %blockSeed.2, %if.end47 ], !dbg !42
+  tail call void @llvm.nvvm.barrier0(), !dbg !42
+  br i1 %cmp20, label %if.then61, label %if.end83, !dbg !43
+
+if.then61:                                        ; preds = %if.end56
+  %tmp67 = load i32, ptr addrspace(1) %getElem11, align 4, !dbg !44
+  %tmp69 = load i32, ptr addrspace(1) %0, align 4, !dbg !46
+  %tobool.not = icmp eq i32 %tmp69, 0, !dbg !46
+  %conv71 = sext i32 %tmp67 to i64, !dbg !46
+  %cmp73 = icmp ne i64 %spec.select, %conv71, !dbg !46
+  %or.cond = select i1 %tobool.not, i1 %cmp73, i1 false, !dbg !46
+  br i1 %or.cond, label %if.then75, label %if.end77, !dbg !46
+
+if.then75:                                        ; preds = %if.then61
+  store i32 3, ptr addrspace(1) %0, align 4, !dbg !47
+  br label %if.end77, !dbg !47
+
+if.end77:                                         ; preds = %if.then61, %if.then75
+  store i32 0, ptr addrspace(1) %getElem11, align 4, !dbg !49
+  br label %if.end83, !dbg !49
+
+if.end83:                                         ; preds = %if.end77, %if.end56
+  tail call void @llvm.nvvm.barrier0(), !dbg !50
+  %tmp85 = load i32, ptr addrspace(1) %0, align 4, !dbg !51
+  %tobool86.not = icmp eq i32 %tmp85, 0, !dbg !51
+  %inc90 = add nuw i64 %i.023, 1
+  %cmp30 = icmp ult i64 %inc90, %opt.fca.3.extract
+  %or.cond24 = select i1 %tobool86.not, i1 %cmp30, i1 false, !dbg !51
+  br i1 %or.cond24, label %for.body, label %return, !dbg !51, !llvm.loop !52
+
+return:                                           ; preds = %if.end83, %if.end26
+  ret void, !dbg !54
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { alwaysinline convergent mustprogress norecurse nounwind willreturn "target-cpu"="sm_75" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t6.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "cnpWideLaunch", linkageName: "_Z13cnpWideLaunch7GridOptyyPiS0_", scope: !1, file: !1, line: 61, type: !5, scopeLine: 61, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 64, column: 5, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 62, column: 1)
+!8 = !DILocation(line: 67, column: 5, scope: !7)
+!9 = !DILocation(line: 70, column: 5, scope: !7)
+!10 = !DILocation(line: 71, column: 5, scope: !7)
+!11 = !DILocation(line: 72, column: 5, scope: !7)
+!12 = !DILocation(line: 77, column: 5, scope: !7)
+!13 = !DILocation(line: 78, column: 9, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !7, file: !1, line: 77, column: 5)
+!15 = !DILocation(line: 81, column: 5, scope: !7)
+!16 = !DILocation(line: 83, column: 9, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 81, column: 5)
+!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 81, column: 5)
+!19 = distinct !DILexicalBlock(scope: !7, file: !1, line: 81, column: 5)
+!20 = !DILocation(line: 82, column: 9, scope: !17)
+!21 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 49, column: 1)
+!23 = distinct !DISubprogram(name: "znew", linkageName: "_Z4znewy", scope: !1, file: !1, line: 48, type: !5, scopeLine: 48, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!24 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 54, column: 1)
+!26 = distinct !DISubprogram(name: "rrand", linkageName: "_Z5rrandPy", scope: !1, file: !1, line: 53, type: !5, scopeLine: 53, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 84, column: 13, scope: !28)
+!28 = distinct !DILexicalBlock(scope: !17, file: !1, line: 83, column: 9)
+!29 = !DILocation(line: 84, column: 13, scope: !28)
+!30 = !DILocation(line: 86, column: 9, scope: !17)
+!31 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !32)
+!32 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !33)
+!33 = distinct !DILocation(line: 87, column: 13, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !17, file: !1, line: 86, column: 9)
+!35 = !DILocation(line: 87, column: 13, scope: !34)
+!36 = !DILocation(line: 89, column: 9, scope: !17)
+!37 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !38)
+!38 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !39)
+!39 = distinct !DILocation(line: 90, column: 13, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !17, file: !1, line: 89, column: 9)
+!41 = !DILocation(line: 90, column: 13, scope: !40)
+!42 = !DILocation(line: 92, column: 9, scope: !17)
+!43 = !DILocation(line: 93, column: 9, scope: !17)
+!44 = !DILocation(line: 95, column: 13, scope: !45)
+!45 = distinct !DILexicalBlock(scope: !17, file: !1, line: 93, column: 9)
+!46 = !DILocation(line: 98, column: 13, scope: !45)
+!47 = !DILocation(line: 99, column: 17, scope: !48)
+!48 = distinct !DILexicalBlock(scope: !45, file: !1, line: 98, column: 13)
+!49 = !DILocation(line: 102, column: 13, scope: !45)
+!50 = !DILocation(line: 104, column: 9, scope: !17)
+!51 = !DILocation(line: 107, column: 9, scope: !17)
+!52 = distinct !{!52, !53}
+!53 = !{!"llvm.loop.mustprogress"}
+!54 = !DILocation(line: 111, column: 1, scope: !7)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
new file mode 100644
index 0000000000000..9a76fe7ac4a1f
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc < %s -mattr=+ptx70 | FileCheck %s --check-prefix=PTXVERSION
+; RUN: llc < %s -mattr=+ptx72 --line-info-inlined-at=false | FileCheck %s --check-prefix=NOFLAG
+;
+;; Test command-line flags to control inlined_at emission - verifies that PTX 7.2+ emits
+;; function_name and inlined_at by default, while PTX 7.0 or --line-info-inlined-at=false disables it.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+;     calculate();
+; }
+;
+; __device__ void bar() {
+;   if (gg > 17)
+;     foo();
+;     calculate();
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; DEFAULT: .loc [[FILENUM:[1-9]]] 10
+; DEFAULT: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .section .debug_str
+; DEFAULT: {
+; DEFAULT: [[BARNAME]]:
+; DEFAULT-NEXT: // {{.*}} _Z3barv
+; DEFAULT: }
+
+; NOFLAG-NOT: function_name
+; NOFLAG-NOT: inlined_at {{[1-9]}}
+; NOFLAG-NOT: .section .debug_str
+
+; PTXVERSION-NOT: function_name
+; PTXVERSION-NOT: inlined_at {{[1-9]}}
+; PTXVERSION-NOT: .section .debug_str
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: mustprogress willreturn
+define internal fastcc void @_Z3foov() unnamed_addr #0 !dbg !4 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+  br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then:                                          ; preds = %entry
+  %cmp.i = icmp samesign ugt i32 %tmp, 17, !dbg !8
+  br i1 %cmp.i, label %if.then.i, label %_Z3barv.exit, !dbg !8
+
+if.then.i:                                        ; preds = %if.then
+  tail call fastcc void @_Z3foov(), !dbg !13
+  br label %_Z3barv.exit, !dbg !13
+
+_Z3barv.exit:                                     ; preds = %if.then, %if.then.i
+  tail call void @_Z9calculatev(), !dbg !15
+  br label %if.end, !dbg !16
+
+if.end:                                           ; preds = %_Z3barv.exit, %entry
+  tail call void @_Z9calculatev(), !dbg !17
+  ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr #1
+
+; Function Attrs: alwaysinline mustprogress willreturn
+define void @_Z6kernelv() #2 !dbg !19 {
+entry:
+  tail call fastcc void @_Z3foov(), !dbg !20
+  ret void, !dbg !22
+}
+
+attributes #0 = { mustprogress willreturn "target-cpu"="sm_75" }
+attributes #1 = { "target-cpu"="sm_75" }
+attributes #2 = { alwaysinline mustprogress willreturn "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t7.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 15, column: 3, scope: !9, inlinedAt: !11)
+!9 = distinct !DILexicalBlock(scope: !10, file: !1, line: 14, column: 29)
+!10 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = distinct !DILocation(line: 10, column: 5, scope: !12)
+!12 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!13 = !DILocation(line: 16, column: 5, scope: !14, inlinedAt: !11)
+!14 = distinct !DILexicalBlock(scope: !9, file: !1, line: 15, column: 3)
+!15 = !DILocation(line: 17, column: 3, scope: !9, inlinedAt: !11)
+!16 = !DILocation(line: 10, column: 5, scope: !12)
+!17 = !DILocation(line: 11, column: 3, scope: !7)
+!18 = !DILocation(line: 12, column: 1, scope: !7)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!20 = !DILocation(line: 21, column: 3, scope: !21)
+!21 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!22 = !DILocation(line: 22, column: 1, scope: !21)

>From ca957612c02fe3d972c8bbf6b8a0564ce5f21614 Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Fri, 5 Dec 2025 13:11:54 -0800
Subject: [PATCH 2/3] changed hook function names and few comments

---
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 15 +++++++--------
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h   | 12 ++++++------
 llvm/lib/Target/NVPTX/CMakeLists.txt       |  2 +-
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp  |  9 +++++----
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h    | 10 +++++-----
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 3fd60d70cfb68..29c09fc526ce1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2204,18 +2204,17 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
       Flags |= DWARF2_FLAG_IS_STMT;
   }
 
-  // Call the hook that allows targets to customize source line recording
-  recordSourceLineHook(*MI, DL, Flags);
+  // Call target-specific source line recording.
+  recordTargetSourceLine(*MI, DL, Flags);
 
   // If we're not at line 0, remember this location.
   if (DL.getLine())
     PrevInstLoc = DL;
 }
 
-// Default implementation of target-specific hook for custom source line
-// recording
-void DwarfDebug::recordSourceLineHook(const MachineInstr &MI,
-                                      const DebugLoc &DL, unsigned Flags) {
+// Default implementation of target-specific source line recording.
+void DwarfDebug::recordTargetSourceLine(const MachineInstr &MI,
+                                        const DebugLoc &DL, unsigned Flags) {
   SmallString<128> LocationString;
   if (Asm->OutStreamer->isVerboseAsm()) {
     raw_svector_ostream OS(LocationString);
@@ -2719,8 +2718,8 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
   Asm->OutStreamer->getContext().setDwarfCompileUnitID(
       getDwarfCompileUnitIDForLineTable(CU));
 
-  // Call target-specific hook for custom initialization
-  beginFunctionHook(*MF);
+  // Call target-specific debug info initialization.
+  initializeTargetDebugInfo(*MF);
 
   // Record beginning of function.
   PrologEndLoc = emitInitialLocDirective(
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 58aeb09645984..25d233025652d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -717,13 +717,13 @@ class DwarfDebug : public DebugHandlerBase {
 
   void skippedNonDebugFunction() override;
 
-  /// Target-specific hook for custom initialization,
-  /// default implementation is empty, only being used for NVPTX target
-  virtual void beginFunctionHook(const MachineFunction &MF) {}
+  /// Target-specific debug info initialization at function start.
+  /// Default implementation is empty, overridden by NVPTX target.
+  virtual void initializeTargetDebugInfo(const MachineFunction &MF) {}
 
-  /// Target-specific hook for custom source line recording
-  virtual void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
-                                    unsigned Flags);
+  /// Target-specific source line recording.
+  virtual void recordTargetSourceLine(const MachineInstr &MI,
+                                      const DebugLoc &DL, unsigned Flags);
 
   const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
     return InfoHolder.getUnits();
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 505e7e945f108..469aaf7685d09 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,13 +18,13 @@ set(NVPTXCodeGen_sources
   NVPTXAssignValidGlobalNames.cpp
   NVPTXAtomicLower.cpp
   NVPTXCtorDtorLowering.cpp
+  NVPTXIRPeephole.cpp
   NVPTXDwarfDebug.cpp
   NVPTXForwardParams.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
   NVPTXImageOptimizer.cpp
   NVPTXInstrInfo.cpp
-  NVPTXIRPeephole.cpp
   NVPTXISelDAGToDAG.cpp
   NVPTXISelLowering.cpp
   NVPTXLowerAggrCopies.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
index 57410179a6344..d2f95ce28e783 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -151,15 +151,16 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
   }
 }
 
-// NVPTX-specific function initialization hook.
-void NVPTXDwarfDebug::beginFunctionHook(const MachineFunction &MF) {
+// NVPTX-specific debug info initialization.
+void NVPTXDwarfDebug::initializeTargetDebugInfo(const MachineFunction &MF) {
   InlinedAtLocs.clear();
   collectInlinedAtLocations(MF);
 }
 
 // NVPTX-specific source line recording with inlined_at support.
-void NVPTXDwarfDebug::recordSourceLineHook(const MachineInstr &MI,
-                                           const DebugLoc &DL, unsigned Flags) {
+void NVPTXDwarfDebug::recordTargetSourceLine(const MachineInstr &MI,
+                                             const DebugLoc &DL,
+                                             unsigned Flags) {
   // Call NVPTX-specific implementation that handles inlined_at.
   recordSourceLineAndInlinedAt(MI, Flags);
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
index d7032725134e6..9794d613ad9ba 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -36,11 +36,11 @@ class NVPTXDwarfDebug : public DwarfDebug {
   void collectInlinedAtLocations(const MachineFunction &MF);
 
 protected:
-  /// Override hook to collect inlined_at locations.
-  void beginFunctionHook(const MachineFunction &MF) override;
-  /// Override hook to record source line information with inlined_at support.
-  void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
-                            unsigned Flags) override;
+  /// Override to collect inlined_at locations.
+  void initializeTargetDebugInfo(const MachineFunction &MF) override;
+  /// Override to record source line information with inlined_at support.
+  void recordTargetSourceLine(const MachineInstr &MI, const DebugLoc &DL,
+                              unsigned Flags) override;
 
 private:
   /// NVPTX-specific source line recording with inlined_at support.

>From ee6758a3e2bc794f43965e5879a80ac488fce75e Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Thu, 11 Dec 2025 18:46:13 -0800
Subject: [PATCH 3/3] Code refactor to remove scanning pass and track only the
 emitted locs

---
 llvm/include/llvm/MC/MCStreamer.h         |  2 +-
 llvm/lib/MC/MCAsmStreamer.cpp             |  2 +-
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp | 51 +++++------------------
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h   |  8 ++--
 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll  |  4 --
 5 files changed, 16 insertions(+), 51 deletions(-)

diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 6fe858c3201eb..6f1b3d5ccb2cd 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -902,7 +902,7 @@ class LLVM_ABI MCStreamer {
                                      StringRef FileName,
                                      StringRef Comment = {});
 
-  /// This is same as emitDwarfLocDirective, except has capability to
+  /// This is same as emitDwarfLocDirective, except it has the capability to
   /// add inlined_at information.
   virtual void emitDwarfLocDirectiveWithInlinedAt(
       unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index bdfaff0a22417..18a61ea7c9e00 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -1739,7 +1739,7 @@ void MCAsmStreamer::emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
       OS << Comment;
   }
 
-  // Emit end of line and update parent state
+  // Emit end of line and update the baseclass state
   EmitEOL();
   MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
                                     Discriminator, FileName, Comment);
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
index d2f95ce28e783..f5a674415cb28 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -35,36 +35,6 @@ static cl::opt<bool> LineInfoWithInlinedAt(
 
 NVPTXDwarfDebug::NVPTXDwarfDebug(AsmPrinter *A) : DwarfDebug(A) {}
 
-// Collect all inlined_at locations for the current function.
-void NVPTXDwarfDebug::collectInlinedAtLocations(const MachineFunction &MF) {
-  const DISubprogram *SP = MF.getFunction().getSubprogram();
-  assert(SP && "expecting valid subprogram here");
-
-  // inlined_at support requires PTX 7.2 or later.
-  const NVPTXSubtarget &STI = MF.getSubtarget<NVPTXSubtarget>();
-  if (STI.getPTXVersion() < 72)
-    return;
-
-  if (!(SP->getUnit()->isDebugDirectivesOnly() ||
-        SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly) ||
-      !LineInfoWithInlinedAt) // No enhanced lineinfo, we are done.
-    return;
-
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      const DebugLoc &DL = MI.getDebugLoc();
-      if (!DL)
-        continue;
-      const DILocation *InlinedAt = DL.getInlinedAt();
-      while (InlinedAt) {
-        if (!InlinedAtLocs.insert(InlinedAt).second)
-          break;
-        InlinedAt = InlinedAt->getInlinedAt();
-      }
-    }
-  }
-}
-
 // NVPTX-specific source line recording with inlined_at support.
 void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
                                                    unsigned Flags) {
@@ -73,7 +43,7 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
   // inlined_at directive, we might need to emit additional .loc prior
   // to it for the location contained in the inlined_at.
   SmallVector<const DILocation *, 8> WorkList;
-  DenseSet<const DILocation *> WorkListSet;
+  SmallDenseSet<const DILocation *, 8> WorkListSet;
   const DILocation *EmitLoc = DL.get();
 
   const DISubprogram *SP = MI.getMF()->getFunction().getSubprogram();
@@ -101,9 +71,10 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
       break;
 
     const DILocation *IA = EmitLoc->getInlinedAt();
-    // Check if this has inlined_at information, and if we have not yet
-    // emitted the .loc for the inlined_at location.
-    if (IA && InlinedAtLocs.contains(IA))
+    // Check if this has inlined_at information, and if the parent location
+    // has not yet been emitted. If already emitted, we don't need to
+    // re-emit the parent chain.
+    if (IA && !EmittedInlinedAtLocs.contains(IA))
       EmitLoc = IA;
     else // We are done
       break;
@@ -129,10 +100,6 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
 
     const unsigned FileNo = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
                                 .getOrCreateSourceID(Scope->getFile());
-    // Remove this location from the work list if it is in the inlined_at
-    // locations set.
-    if (EnhancedLineinfo && InlinedAtLocs.contains(Current))
-      InlinedAtLocs.erase(Current);
 
     if (EnhancedLineinfo && InlinedAt) {
       const unsigned FileIA = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
@@ -148,13 +115,17 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
       Asm->OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
                                               Discriminator, Fn);
     }
+    // Mark this location as emitted so we don't re-emit the parent chain
+    // for subsequent instructions that share the same inlined_at parent.
+    if (EnhancedLineinfo)
+      EmittedInlinedAtLocs.insert(Current);
   }
 }
 
 // NVPTX-specific debug info initialization.
 void NVPTXDwarfDebug::initializeTargetDebugInfo(const MachineFunction &MF) {
-  InlinedAtLocs.clear();
-  collectInlinedAtLocations(MF);
+  // Clear the set of emitted inlined_at locations for each new function.
+  EmittedInlinedAtLocs.clear();
 }
 
 // NVPTX-specific source line recording with inlined_at support.
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
index 9794d613ad9ba..b7cfd5faa36f3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -25,16 +25,14 @@ namespace llvm {
 /// inlined_at support.
 class NVPTXDwarfDebug : public DwarfDebug {
 private:
-  /// Set of InlinedAt locations, used to track if these have been emitted.
-  DenseSet<const DILocation *> InlinedAtLocs;
+  /// Set of inlined_at locations that have already been emitted.
+  /// Used to avoid redundant emission of parent chain .loc directives.
+  DenseSet<const DILocation *> EmittedInlinedAtLocs;
 
 public:
   /// Constructor - Pass through to DwarfDebug constructor.
   NVPTXDwarfDebug(AsmPrinter *A);
 
-  /// Collect all inlined_at locations for the current function.
-  void collectInlinedAtLocations(const MachineFunction &MF);
-
 protected:
   /// Override to collect inlined_at locations.
   void initializeTargetDebugInfo(const MachineFunction &MF) override;
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
index 57df39e8034c4..31a17dc810b4a 100644
--- a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
@@ -165,15 +165,11 @@ entry:
 attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
 
 !llvm.dbg.cu = !{!0}
-!nvvmir.version = !{!2, !3, !4}
 !nvvm.annotations = !{!5, !6, !7, !8}
 !llvm.module.flags = !{!9}
 
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
 !1 = !DIFile(filename: "t4.cu", directory: "")
-!2 = !{i32 2, i32 0, i32 3, i32 2}
-!3 = !{i32 2, i32 0}
-!4 = !{i32 2, i32 0, i32 3, i32 1}
 !5 = !{ptr @_Z7kernel1ii, !"kernel", i32 1}
 !6 = !{ptr @_Z7kernel2ii, !"kernel", i32 1}
 !7 = !{ptr @_Z7kernel3ii, !"kernel", i32 1}



More information about the llvm-commits mailing list