[llvm] [DebugInfo][NVPTX] Adding support for `inlined_at` debug directive in NVPTX backend (PR #170239)

Laxman Sole via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 1 18:04:57 PST 2025


https://github.com/laxmansole created https://github.com/llvm/llvm-project/pull/170239

This change adds support for emitting the enhanced PTX debugging directives `function_name` and `inlined_at` as part of the `.loc` directive in the NVPTX backend.

`.loc` syntax - 
>.loc file_index line_number column_position

`.loc` syntax with `inlined_at` attribute - 
>.loc file_index line_number column_position,function_name label {+ immediate }, inlined_at file_index2 line_number2 column_position2

`inlined_at` attribute specified as part of the `.loc` directive indicates PTX instructions that are generated from a function that got inlined. It specifies the source location at which the specified function is inlined. `file_index2`, `line_number2`, and `column_position2` specify the location at which the function is inlined.

The `function_name` attribute specifies an offset in the DWARF section- `.debug_str`. Offset is specified as a label expression or a label + immediate expression, where label is defined in the `.debug_str` section. DWARF section `.debug_str` contains ASCII null-terminated strings that specify the name of the function that is inlined.

These attributes were introduced in PTX ISA version 7.2 (see NVIDIA’s documentation: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#debugging-directives-loc ).

To support these features, the PR introduces a new NVPTXDwarfDebug class derived from DwarfDebug, which implements NVPTX-specific logic for emitting these directives. The base DwarfDebug infrastructure is extended with new virtual hooks (beginFunctionHook() and recordSourceLineHook()) that enable the NVPTX backend to generate this additional debug information.

The MC layer is also updated to emit the NVPTX-specific `.loc` attributes (function_name and inlined_at). The implementation applies to PTX ISA 7.2 and later when the debug-info emission kind is either lineTableOnly or DebugDirectiveOnly. A new command-line option, `--line-info-inlined-at=<true/false>`, is added to control whether the inlined_at attribute is generated.

Note - The `NVCC` compiler already emits the `.loc` directive with `inlined_at` when compiled with `-lineinfo` option. 

>From edb746059fc03545e02d51557854e089f702f1c5 Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Wed, 26 Nov 2025 15:55:11 -0800
Subject: [PATCH] Adding support for inlined_at debug directive in NVPTX
 backend

---
 llvm/include/llvm/CodeGen/AsmPrinter.h        |   4 +
 llvm/include/llvm/MC/MCStreamer.h             |   8 +
 llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp    |   4 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    |  19 +-
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h      |  21 +-
 .../CodeGen/AsmPrinter/DwarfStringPool.cpp    |   7 +-
 llvm/lib/MC/MCAsmStreamer.cpp                 | 123 +++++--
 llvm/lib/Target/NVPTX/CMakeLists.txt          |   1 +
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp     |   5 +
 llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h       |   3 +
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp     | 165 +++++++++
 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h       |  52 +++
 llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll      | 144 ++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll      |  85 +++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll      | 305 ++++++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll      | 229 ++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll      | 208 +++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll      | 343 ++++++++++++++++++
 llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll      | 118 ++++++
 19 files changed, 1800 insertions(+), 44 deletions(-)
 create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
 create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
 create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll

diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 311f7df98cf8c..d01d93750e806 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -277,6 +277,10 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass {
   AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer,
              char &ID = AsmPrinter::ID);
 
+  /// Create the DwarfDebug handler. Targets can override this to provide
+  /// custom debug information handling.
+  virtual DwarfDebug *createDwarfDebug();
+
 public:
   ~AsmPrinter() override;
 
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 79c715e3820a6..6fe858c3201eb 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -902,6 +902,14 @@ class LLVM_ABI MCStreamer {
                                      StringRef FileName,
                                      StringRef Comment = {});
 
+  /// This is same as emitDwarfLocDirective, except has capability to
+  /// add inlined_at information.
+  virtual void emitDwarfLocDirectiveWithInlinedAt(
+      unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+      unsigned LineIA, unsigned ColumnIA, const MCSymbol *Sym, unsigned Flags,
+      unsigned Isa, unsigned Discriminator, StringRef FileName,
+      StringRef Comment = {}) {}
+
   /// This implements the '.loc_label Name' directive.
   virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name);
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 3aa245b7f3f1e..1a0a0cfd8f6ac 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -470,6 +470,8 @@ const MCSection *AsmPrinter::getCurrentSection() const {
   return OutStreamer->getCurrentSectionOnly();
 }
 
+DwarfDebug *AsmPrinter::createDwarfDebug() { return new DwarfDebug(this); }
+
 void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -592,7 +594,7 @@ bool AsmPrinter::doInitialization(Module &M) {
       Handlers.push_back(std::make_unique<CodeViewDebug>(this));
     if (!EmitCodeView || M.getDwarfVersion()) {
       if (hasDebugInfo()) {
-        DD = new DwarfDebug(this);
+        DD = createDwarfDebug();
         Handlers.push_back(std::unique_ptr<DwarfDebug>(DD));
       }
     }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 40bfea059c707..3fd60d70cfb68 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2204,13 +2204,27 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
       Flags |= DWARF2_FLAG_IS_STMT;
   }
 
-  RecordSourceLine(DL, Flags);
+  // Call the hook that allows targets to customize source line recording
+  recordSourceLineHook(*MI, DL, Flags);
 
   // If we're not at line 0, remember this location.
   if (DL.getLine())
     PrevInstLoc = DL;
 }
 
+// Default implementation of target-specific hook for custom source line
+// recording
+void DwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+                                      const DebugLoc &DL, unsigned Flags) {
+  SmallString<128> LocationString;
+  if (Asm->OutStreamer->isVerboseAsm()) {
+    raw_svector_ostream OS(LocationString);
+    DL.print(OS);
+  }
+  recordSourceLine(DL.getLine(), DL.getCol(), DL.getScope(), Flags,
+                   LocationString);
+}
+
 // Returns the position where we should place prologue_end, potentially nullptr,
 // which means "no good place to put prologue_end". Returns true in the second
 // return value if there are no setup instructions in this function at all,
@@ -2705,6 +2719,9 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
   Asm->OutStreamer->getContext().setDwarfCompileUnitID(
       getDwarfCompileUnitIDForLineTable(CU));
 
+  // Call target-specific hook for custom initialization
+  beginFunctionHook(*MF);
+
   // Record beginning of function.
   PrologEndLoc = emitInitialLocDirective(
       *MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 1a1b28a6fc035..58aeb09645984 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -394,9 +394,6 @@ class DwarfDebug : public DebugHandlerBase {
   /// table for the same directory as DW_AT_comp_dir.
   StringRef CompilationDir;
 
-  /// Holder for the file specific debug information.
-  DwarfFile InfoHolder;
-
   /// Holders for the various debug information flags that we might need to
   /// have exposed. See accessor functions below for description.
 
@@ -532,10 +529,6 @@ class DwarfDebug : public DebugHandlerBase {
 
   MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
 
-  const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
-    return InfoHolder.getUnits();
-  }
-
   using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
 
   void ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
@@ -711,6 +704,8 @@ class DwarfDebug : public DebugHandlerBase {
   void computeKeyInstructions(const MachineFunction *MF);
 
 protected:
+  /// Holder for the file specific debug information.
+  DwarfFile InfoHolder;
   /// Gather pre-function debug information.
   void beginFunctionImpl(const MachineFunction *MF) override;
 
@@ -722,6 +717,18 @@ class DwarfDebug : public DebugHandlerBase {
 
   void skippedNonDebugFunction() override;
 
+  /// Target-specific hook for custom initialization,
+  /// default implementation is empty, only being used for NVPTX target
+  virtual void beginFunctionHook(const MachineFunction &MF) {}
+
+  /// Target-specific hook for custom source line recording
+  virtual void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+                                    unsigned Flags);
+
+  const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+    return InfoHolder.getUnits();
+  }
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index d304c7efe2a75..4ccad65bdba91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -96,9 +96,12 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
     if (ShouldCreateSymbols)
       Asm.OutStreamer->emitLabel(Entry->getValue().Symbol);
 
+    // Emit a comment with the string offset and the string itself.
+    Asm.OutStreamer->AddComment(
+        "string offset=" + Twine(Entry->getValue().Offset) + " ; " +
+        StringRef(Entry->getKeyData(), Entry->getKeyLength()));
+
     // Emit the string itself with a terminating null byte.
-    Asm.OutStreamer->AddComment("string offset=" +
-                                Twine(Entry->getValue().Offset));
     Asm.OutStreamer->emitBytes(
         StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
   }
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 885fa55b65d50..b46ec73a4b012 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -70,6 +70,17 @@ class MCAsmStreamer final : public MCStreamer {
   void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
   void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
 
+  /// Helper to emit common .loc directive flags, isa, and discriminator
+  void emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+                                  unsigned Discriminator);
+
+  /// Helper to emit the common suffix of .loc directives (flags, comment, EOL,
+  /// parent call)
+  void emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+                                   unsigned Column, unsigned Flags,
+                                   unsigned Isa, unsigned Discriminator,
+                                   StringRef FileName, StringRef Comment);
+
 public:
   MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
                 std::unique_ptr<MCInstPrinter> printer,
@@ -297,6 +308,14 @@ class MCAsmStreamer final : public MCStreamer {
                              StringRef Location = {}) override;
   void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override;
 
+  void emitDwarfLocDirectiveWithInlinedAt(unsigned FileNo, unsigned Line,
+                                          unsigned Column, unsigned FileIA,
+                                          unsigned LineIA, unsigned ColIA,
+                                          const MCSymbol *Sym, unsigned Flags,
+                                          unsigned Isa, unsigned Discriminator,
+                                          StringRef FileName,
+                                          StringRef Comment = {}) override;
+
   MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
 
   bool emitCVFileDirective(unsigned FileNo, StringRef Filename,
@@ -1675,6 +1694,57 @@ void MCAsmStreamer::emitDwarfFile0Directive(
     emitRawText(OS1.str());
 }
 
+/// Helper to emit common .loc directive flags, isa, and discriminator
+void MCAsmStreamer::emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+                                               unsigned Discriminator) {
+  if (!MAI->supportsExtendedDwarfLocDirective())
+    return;
+
+  if (Flags & DWARF2_FLAG_BASIC_BLOCK)
+    OS << " basic_block";
+  if (Flags & DWARF2_FLAG_PROLOGUE_END)
+    OS << " prologue_end";
+  if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
+    OS << " epilogue_begin";
+
+  const unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
+  if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
+    OS << " is_stmt ";
+    OS << ((Flags & DWARF2_FLAG_IS_STMT) ? "1" : "0");
+  }
+
+  if (Isa)
+    OS << " isa " << Isa;
+  if (Discriminator)
+    OS << " discriminator " << Discriminator;
+}
+
+/// Helper to emit the common suffix of .loc directives
+void MCAsmStreamer::emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+                                                unsigned Column, unsigned Flags,
+                                                unsigned Isa,
+                                                unsigned Discriminator,
+                                                StringRef FileName,
+                                                StringRef Comment) {
+  // Emit flags, isa, and discriminator
+  emitDwarfLocDirectiveFlags(Flags, Isa, Discriminator);
+
+  // Emit verbose comment if enabled
+  if (IsVerboseAsm) {
+    OS.PadToColumn(MAI->getCommentColumn());
+    OS << MAI->getCommentString() << ' ';
+    if (Comment.empty())
+      OS << FileName << ':' << Line << ':' << Column;
+    else
+      OS << Comment;
+  }
+
+  // Emit end of line and update parent state
+  EmitEOL();
+  MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
+                                    Discriminator, FileName, Comment);
+}
+
 void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
                                           unsigned Column, unsigned Flags,
                                           unsigned Isa, unsigned Discriminator,
@@ -1691,42 +1761,29 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
     return;
   }
 
+  // Emit the basic .loc directive
   OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
-  if (MAI->supportsExtendedDwarfLocDirective()) {
-    if (Flags & DWARF2_FLAG_BASIC_BLOCK)
-      OS << " basic_block";
-    if (Flags & DWARF2_FLAG_PROLOGUE_END)
-      OS << " prologue_end";
-    if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
-      OS << " epilogue_begin";
-
-    unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
-    if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
-      OS << " is_stmt ";
-
-      if (Flags & DWARF2_FLAG_IS_STMT)
-        OS << "1";
-      else
-        OS << "0";
-    }
 
-    if (Isa)
-      OS << " isa " << Isa;
-    if (Discriminator)
-      OS << " discriminator " << Discriminator;
-  }
+  // Emit common suffix (flags, comment, EOL, parent call)
+  emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+                              FileName, Comment);
+}
 
-  if (IsVerboseAsm) {
-    OS.PadToColumn(MAI->getCommentColumn());
-    OS << MAI->getCommentString() << ' ';
-    if (Comment.empty())
-      OS << FileName << ':' << Line << ':' << Column;
-    else
-      OS << Comment;
-  }
-  EmitEOL();
-  this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
-                                          Discriminator, FileName, Comment);
+/// This is same as emitDwarfLocDirective, except also emits inlined function
+/// and inlined callsite information.
+void MCAsmStreamer::emitDwarfLocDirectiveWithInlinedAt(
+    unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+    unsigned LineIA, unsigned ColIA, const MCSymbol *Sym, unsigned Flags,
+    unsigned Isa, unsigned Discriminator, StringRef FileName,
+    StringRef Comment) {
+  // Emit the basic .loc directive with NVPTX-specific extensions
+  OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
+  OS << ", function_name " << *Sym;
+  OS << ", inlined_at " << FileIA << " " << LineIA << " " << ColIA;
+
+  // Emit common suffix (flags, comment, EOL, parent call)
+  emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+                              FileName, Comment);
 }
 
 void MCAsmStreamer::emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) {
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index f9c24750c4836..b96a674d74e91 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources
   NVPTXAssignValidGlobalNames.cpp
   NVPTXAtomicLower.cpp
   NVPTXCtorDtorLowering.cpp
+  NVPTXDwarfDebug.cpp
   NVPTXForwardParams.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9bbb3aad89c44..d5c9988d0eb53 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
 #include "MCTargetDesc/NVPTXTargetStreamer.h"
 #include "NVPTX.h"
+#include "NVPTXDwarfDebug.h"
 #include "NVPTXMCExpr.h"
 #include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXRegisterInfo.h"
@@ -676,6 +677,10 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
   OutStreamer->emitRawText(OS1.str());
 }
 
+DwarfDebug *NVPTXAsmPrinter::createDwarfDebug() {
+  return new NVPTXDwarfDebug(this);
+}
+
 bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
   const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index f35931868d99f..20c1bbb9e15b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -194,6 +194,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
   bool doInitialization(Module &M) override;
   bool doFinalization(Module &M) override;
 
+  // Create NVPTX-specific DwarfDebug handler
+  DwarfDebug *createDwarfDebug() override;
+
 private:
   bool GlobalsEmitted;
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
new file mode 100644
index 0000000000000..57410179a6344
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -0,0 +1,165 @@
+//===-- NVPTXDwarfDebug.cpp - NVPTX DwarfDebug Implementation ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper functions for NVPTX-specific debug information
+// processing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXDwarfDebug.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// Command line option to control inlined_at enhancement to lineinfo support.
+// Valid only when debuginfo emissionkind is DebugDirectivesOnly or
+// LineTablesOnly.
+static cl::opt<bool> LineInfoWithInlinedAt(
+    "line-info-inlined-at",
+    cl::desc("Emit line with inlined_at enhancement for NVPTX"), cl::init(true),
+    cl::Hidden);
+
+NVPTXDwarfDebug::NVPTXDwarfDebug(AsmPrinter *A) : DwarfDebug(A) {}
+
+// Collect all inlined_at locations for the current function.
+void NVPTXDwarfDebug::collectInlinedAtLocations(const MachineFunction &MF) {
+  const DISubprogram *SP = MF.getFunction().getSubprogram();
+  assert(SP && "expecting valid subprogram here");
+
+  // inlined_at support requires PTX 7.2 or later.
+  const NVPTXSubtarget &STI = MF.getSubtarget<NVPTXSubtarget>();
+  if (STI.getPTXVersion() < 72)
+    return;
+
+  if (!(SP->getUnit()->isDebugDirectivesOnly() ||
+        SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly) ||
+      !LineInfoWithInlinedAt) // No enhanced lineinfo, we are done.
+    return;
+
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      const DebugLoc &DL = MI.getDebugLoc();
+      if (!DL)
+        continue;
+      const DILocation *InlinedAt = DL.getInlinedAt();
+      while (InlinedAt) {
+        if (!InlinedAtLocs.insert(InlinedAt).second)
+          break;
+        InlinedAt = InlinedAt->getInlinedAt();
+      }
+    }
+  }
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
+                                                   unsigned Flags) {
+  const DebugLoc &DL = MI.getDebugLoc();
+  // Maintain a work list of .loc to be emitted. If we are emitting the
+  // inlined_at directive, we might need to emit additional .loc prior
+  // to it for the location contained in the inlined_at.
+  SmallVector<const DILocation *, 8> WorkList;
+  DenseSet<const DILocation *> WorkListSet;
+  const DILocation *EmitLoc = DL.get();
+
+  const DISubprogram *SP = MI.getMF()->getFunction().getSubprogram();
+  const NVPTXSubtarget &STI = MI.getMF()->getSubtarget<NVPTXSubtarget>();
+  const bool EnhancedLineinfo =
+      LineInfoWithInlinedAt && (STI.getPTXVersion() >= 72) && SP &&
+      (SP->getUnit()->isDebugDirectivesOnly() ||
+       SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly);
+
+  while (EmitLoc) {
+    // Get the scope for the current location.
+    const DIScope *Scope = EmitLoc->getScope();
+    if (!Scope)
+      break; // scope is null, we are done.
+
+    // Check if this loc is already in work list, if so, we are done.
+    if (WorkListSet.contains(EmitLoc))
+      break;
+
+    // Add this location to the work list.
+    WorkList.push_back(EmitLoc);
+    WorkListSet.insert(EmitLoc);
+
+    if (!EnhancedLineinfo) // No enhanced lineinfo, we are done.
+      break;
+
+    const DILocation *IA = EmitLoc->getInlinedAt();
+    // Check if this has inlined_at information, and if we have not yet
+    // emitted the .loc for the inlined_at location.
+    if (IA && InlinedAtLocs.contains(IA))
+      EmitLoc = IA;
+    else // We are done
+      break;
+  }
+
+  const unsigned CUID = Asm->OutStreamer->getContext().getDwarfCompileUnitID();
+  // Traverse the work list, and emit .loc.
+  while (!WorkList.empty()) {
+    const DILocation *Current = WorkList.pop_back_val();
+    const DIScope *Scope = Current->getScope();
+
+    if (!Scope)
+      llvm_unreachable("we shouldn't be here for null scope");
+
+    const DILocation *InlinedAt = Current->getInlinedAt();
+    StringRef Fn = Scope->getFilename();
+    const unsigned Line = Current->getLine();
+    const unsigned Col = Current->getColumn();
+    unsigned Discriminator = 0;
+    if (Line != 0 && getDwarfVersion() >= 4)
+      if (const DILexicalBlockFile *LBF = dyn_cast<DILexicalBlockFile>(Scope))
+        Discriminator = LBF->getDiscriminator();
+
+    const unsigned FileNo = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+                                .getOrCreateSourceID(Scope->getFile());
+    // Remove this location from the work list if it is in the inlined_at
+    // locations set.
+    if (EnhancedLineinfo && InlinedAtLocs.contains(Current))
+      InlinedAtLocs.erase(Current);
+
+    if (EnhancedLineinfo && InlinedAt) {
+      const unsigned FileIA = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+                                  .getOrCreateSourceID(InlinedAt->getFile());
+      const DISubprogram *SubProgram = getDISubprogram(Current->getScope());
+      DwarfStringPoolEntryRef Entry = InfoHolder.getStringPool().getEntry(
+          *Asm, SubProgram->getLinkageName());
+      Asm->OutStreamer->emitDwarfLocDirectiveWithInlinedAt(
+          FileNo, Line, Col, FileIA, InlinedAt->getLine(),
+          InlinedAt->getColumn(), Entry.getSymbol(), Flags, 0, Discriminator,
+          Fn);
+    } else {
+      Asm->OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
+                                              Discriminator, Fn);
+    }
+  }
+}
+
+// NVPTX-specific function initialization hook.
+void NVPTXDwarfDebug::beginFunctionHook(const MachineFunction &MF) {
+  InlinedAtLocs.clear();
+  collectInlinedAtLocations(MF);
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+                                           const DebugLoc &DL, unsigned Flags) {
+  // Call NVPTX-specific implementation that handles inlined_at.
+  recordSourceLineAndInlinedAt(MI, Flags);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
new file mode 100644
index 0000000000000..d7032725134e6
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -0,0 +1,52 @@
+//===-- NVPTXDwarfDebug.h - NVPTX DwarfDebug Implementation ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares helper classes and functions for NVPTX-specific debug
+// information processing, particularly for inlined function call sites and
+// enhanced line information with inlined_at directives.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+
+#include "../../CodeGen/AsmPrinter/DwarfCompileUnit.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace llvm {
+
+/// NVPTXDwarfDebug - NVPTX-specific DwarfDebug implementation.
+/// Inherits from DwarfDebug to provide enhanced line information with
+/// inlined_at support.
+class NVPTXDwarfDebug : public DwarfDebug {
+private:
+  /// Set of InlinedAt locations, used to track if these have been emitted.
+  DenseSet<const DILocation *> InlinedAtLocs;
+
+public:
+  /// Constructor - Pass through to DwarfDebug constructor.
+  NVPTXDwarfDebug(AsmPrinter *A);
+
+  /// Collect all inlined_at locations for the current function.
+  void collectInlinedAtLocations(const MachineFunction &MF);
+
+protected:
+  /// Override hook to collect inlined_at locations.
+  void beginFunctionHook(const MachineFunction &MF) override;
+  /// Override hook to record source line information with inlined_at support.
+  void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+                            unsigned Flags) override;
+
+private:
+  /// NVPTX-specific source line recording with inlined_at support.
+  void recordSourceLineAndInlinedAt(const MachineInstr &MI, unsigned Flags);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
new file mode 100644
index 0000000000000..4ce97900be584
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test mutual recursion with deep inlining - verifies that inlined_at information
+;; is correctly emitted for multiple levels of inlining when foo() and bar() call each other.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+;   calculate();
+; }
+;
+; __device__ void bar() {
+;   if (gg > 17)
+;     foo();
+;   calculate();
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 21
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 16
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+define internal fastcc void @_Z3foov() unnamed_addr  !dbg !4 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+  br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @_Z3barv(), !dbg !8
+  br label %if.end, !dbg !8
+
+if.end:                                           ; preds = %if.then, %entry
+  tail call void @_Z9calculatev(), !dbg !10
+  ret void, !dbg !11
+}
+
+define internal fastcc void @_Z3barv() unnamed_addr  !dbg !12 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !13
+  %cmp = icmp sgt i32 %tmp, 17, !dbg !13
+  br i1 %cmp, label %if.then, label %if.end, !dbg !13
+
+if.then:                                          ; preds = %entry
+  tail call fastcc void @_Z3foov(), !dbg !15
+  br label %if.end, !dbg !15
+
+if.end:                                           ; preds = %if.then, %entry
+  tail call void @_Z9calculatev(), !dbg !17
+  ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr
+
+; Function Attrs: alwaysinline
+define void @_Z6kernelv() #1 !dbg !19 {
+entry:
+  %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !20
+  %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !20
+  br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !20
+
+if.then.i:                                        ; preds = %entry
+  %cmp.i2 = icmp sgt i32 %tmp.i, 17, !dbg !23
+  br i1 %cmp.i2, label %if.then.i10, label %_Z3barv.exit, !dbg !23
+
+if.then.i10:                                      ; preds = %if.then.i
+  tail call fastcc void @_Z3foov(), !dbg !25
+  tail call void @_Z9calculatev(), !dbg !28
+  tail call void @_Z9calculatev(), !dbg !29
+  br label %_Z3barv.exit, !dbg !30
+
+_Z3barv.exit:                                     ; preds = %if.then.i, %if.then.i10
+  tail call void @_Z9calculatev(), !dbg !31
+  br label %_Z3foov.exit, !dbg !32
+
+_Z3foov.exit:                                     ; preds = %entry, %_Z3barv.exit
+  tail call void @_Z9calculatev(), !dbg !33
+  ret void, !dbg !34
+}
+
+attributes #1 = { alwaysinline }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t1.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 10, column: 5, scope: !9)
+!9 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!10 = !DILocation(line: 11, column: 3, scope: !7)
+!11 = !DILocation(line: 12, column: 1, scope: !7)
+!12 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!13 = !DILocation(line: 15, column: 3, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !12, file: !1, line: 14, column: 29)
+!15 = !DILocation(line: 16, column: 5, scope: !16)
+!16 = distinct !DILexicalBlock(scope: !14, file: !1, line: 15, column: 3)
+!17 = !DILocation(line: 17, column: 3, scope: !14)
+!18 = !DILocation(line: 18, column: 1, scope: !14)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!20 = !DILocation(line: 9, column: 3, scope: !7, inlinedAt: !21)
+!21 = distinct !DILocation(line: 21, column: 3, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!23 = !DILocation(line: 15, column: 3, scope: !14, inlinedAt: !24)
+!24 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!25 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !26)
+!26 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !27)
+!27 = distinct !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!28 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !26)
+!29 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !27)
+!30 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!31 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !24)
+!32 = !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!33 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !21)
+!34 = !DILocation(line: 22, column: 1, scope: !22)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
new file mode 100644
index 0000000000000..47e4655a3ef54
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test simple two-level inlining - verifies that inlined_at information is correctly
+;; emitted when foo() calls bar() and kernel() calls foo().
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+;
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+; }
+;
+; __device__ void bar() {
+;   ++gg;
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 18
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelv() #0 !dbg !5 {
+entry:
+  %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !7
+  %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !7
+  br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !7
+
+if.then.i:                                        ; preds = %entry
+  %inc.i.i = add nuw nsw i32 %tmp.i, 1, !dbg !12
+  store i32 %inc.i.i, ptr addrspace(1) @gg, align 4, !dbg !12
+  br label %_Z3foov.exit, !dbg !17
+
+_Z3foov.exit:                                     ; preds = %entry, %if.then.i
+  ret void, !dbg !18
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t2.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelv, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 9, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 8, column: 29)
+!9 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 18, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 17, column: 29)
+!12 = !DILocation(line: 14, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 13, column: 29)
+!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 13, type: !6, scopeLine: 13, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!16 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9, column: 3)
+!17 = !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!18 = !DILocation(line: 19, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
new file mode 100644
index 0000000000000..5b94a249ab64d
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test inlining of a C++ constructor with control flow - verifies that inlined_at
+;; information is correctly emitted when a constructor containing loops and conditionals is inlined.
+;
+; __device__ int gg;
+; __device__ int *arr;
+;
+; class C {
+;   int priv;
+;   public: __device__ C();
+;   __device__ C(int);
+;   __device__ int get() const;
+; };
+;
+;
+; __device__ C::C() : priv(1) {
+;   int sum = 0;
+;   for (int i = 0; i < gg; ++i) sum += arr[i];
+;   if (sum > 17)
+;     priv = sum;
+; }
+;
+; __device__ C::C(int n) : priv(n) {}
+;
+; __device__ int C::get() const { return priv; }
+;
+; __global__ void kernel(int n) {
+;   C c1;
+;   if (n > 7)
+;     gg = c1.get();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 24
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[CTORNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[CTORNAME]]:
+; CHECK-NEXT: // {{.*}} _ZN1CC1Ev
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at arr = internal addrspace(1) global ptr null, align 8
+ at llvm.used = appending global [3 x ptr] [ptr @_Z6kerneli, ptr addrspacecast (ptr addrspace(1) @arr to ptr), ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none)
+define void @_Z6kerneli(i32 noundef %n) #0 !dbg !4 {
+entry:
+  %tmp3.i1 = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp.i2 = icmp sgt i32 %tmp3.i1, 0, !dbg !6
+  br i1 %cmp.i2, label %for.body.i.preheader, label %for.end.i, !dbg !6
+
+for.body.i.preheader:                             ; preds = %entry
+  %tmp4.i = load ptr, ptr addrspace(1) @arr, align 8
+  %0 = addrspacecast ptr %tmp4.i to ptr addrspace(1)
+  %xtraiter = and i32 %tmp3.i1, 15, !dbg !6
+  %1 = icmp samesign ult i32 %tmp3.i1, 16, !dbg !6
+  br i1 %1, label %for.end.i.loopexit.unr-lcssa, label %for.body.i.preheader.new, !dbg !6
+
+for.body.i.preheader.new:                         ; preds = %for.body.i.preheader
+  %unroll_iter = and i32 %tmp3.i1, 2147483632, !dbg !6
+  br label %for.body.i, !dbg !6
+
+for.body.i:                                       ; preds = %for.body.i, %for.body.i.preheader.new
+  %i.0.i4 = phi i32 [ 0, %for.body.i.preheader.new ], [ %inc.i.15, %for.body.i ]
+  %sum.0.i3 = phi i32 [ 0, %for.body.i.preheader.new ], [ %add.i.15, %for.body.i ]
+  %niter = phi i32 [ 0, %for.body.i.preheader.new ], [ %niter.next.15, %for.body.i ]
+  %2 = zext nneg i32 %i.0.i4 to i64, !dbg !6
+  %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %2, !dbg !6
+  %tmp6.i = load i32, ptr addrspace(1) %getElem, align 4, !dbg !6
+  %add.i = add nsw i32 %tmp6.i, %sum.0.i3, !dbg !6
+  %inc.i = add nuw nsw i32 %i.0.i4, 1, !dbg !6
+  %3 = zext nneg i32 %inc.i to i64, !dbg !6
+  %getElem.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %3, !dbg !6
+  %tmp6.i.1 = load i32, ptr addrspace(1) %getElem.1, align 4, !dbg !6
+  %add.i.1 = add nsw i32 %tmp6.i.1, %add.i, !dbg !6
+  %inc.i.1 = add nuw nsw i32 %i.0.i4, 2, !dbg !6
+  %4 = zext nneg i32 %inc.i.1 to i64, !dbg !6
+  %getElem.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %4, !dbg !6
+  %tmp6.i.2 = load i32, ptr addrspace(1) %getElem.2, align 4, !dbg !6
+  %add.i.2 = add nsw i32 %tmp6.i.2, %add.i.1, !dbg !6
+  %inc.i.2 = add nuw nsw i32 %i.0.i4, 3, !dbg !6
+  %5 = zext nneg i32 %inc.i.2 to i64, !dbg !6
+  %getElem.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %5, !dbg !6
+  %tmp6.i.3 = load i32, ptr addrspace(1) %getElem.3, align 4, !dbg !6
+  %add.i.3 = add nsw i32 %tmp6.i.3, %add.i.2, !dbg !6
+  %inc.i.3 = add nuw nsw i32 %i.0.i4, 4, !dbg !6
+  %6 = zext nneg i32 %inc.i.3 to i64, !dbg !6
+  %getElem.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %6, !dbg !6
+  %tmp6.i.4 = load i32, ptr addrspace(1) %getElem.4, align 4, !dbg !6
+  %add.i.4 = add nsw i32 %tmp6.i.4, %add.i.3, !dbg !6
+  %inc.i.4 = add nuw nsw i32 %i.0.i4, 5, !dbg !6
+  %7 = zext nneg i32 %inc.i.4 to i64, !dbg !6
+  %getElem.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %7, !dbg !6
+  %tmp6.i.5 = load i32, ptr addrspace(1) %getElem.5, align 4, !dbg !6
+  %add.i.5 = add nsw i32 %tmp6.i.5, %add.i.4, !dbg !6
+  %inc.i.5 = add nuw nsw i32 %i.0.i4, 6, !dbg !6
+  %8 = zext nneg i32 %inc.i.5 to i64, !dbg !6
+  %getElem.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %8, !dbg !6
+  %tmp6.i.6 = load i32, ptr addrspace(1) %getElem.6, align 4, !dbg !6
+  %add.i.6 = add nsw i32 %tmp6.i.6, %add.i.5, !dbg !6
+  %inc.i.6 = add nuw nsw i32 %i.0.i4, 7, !dbg !6
+  %9 = zext nneg i32 %inc.i.6 to i64, !dbg !6
+  %getElem.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %9, !dbg !6
+  %tmp6.i.7 = load i32, ptr addrspace(1) %getElem.7, align 4, !dbg !6
+  %add.i.7 = add nsw i32 %tmp6.i.7, %add.i.6, !dbg !6
+  %inc.i.7 = add nuw nsw i32 %i.0.i4, 8, !dbg !6
+  %10 = zext nneg i32 %inc.i.7 to i64, !dbg !6
+  %getElem.8 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %10, !dbg !6
+  %tmp6.i.8 = load i32, ptr addrspace(1) %getElem.8, align 4, !dbg !6
+  %add.i.8 = add nsw i32 %tmp6.i.8, %add.i.7, !dbg !6
+  %inc.i.8 = add nuw nsw i32 %i.0.i4, 9, !dbg !6
+  %11 = zext nneg i32 %inc.i.8 to i64, !dbg !6
+  %getElem.9 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %11, !dbg !6
+  %tmp6.i.9 = load i32, ptr addrspace(1) %getElem.9, align 4, !dbg !6
+  %add.i.9 = add nsw i32 %tmp6.i.9, %add.i.8, !dbg !6
+  %inc.i.9 = add nuw nsw i32 %i.0.i4, 10, !dbg !6
+  %12 = zext nneg i32 %inc.i.9 to i64, !dbg !6
+  %getElem.10 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %12, !dbg !6
+  %tmp6.i.10 = load i32, ptr addrspace(1) %getElem.10, align 4, !dbg !6
+  %add.i.10 = add nsw i32 %tmp6.i.10, %add.i.9, !dbg !6
+  %inc.i.10 = add nuw nsw i32 %i.0.i4, 11, !dbg !6
+  %13 = zext nneg i32 %inc.i.10 to i64, !dbg !6
+  %getElem.11 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %13, !dbg !6
+  %tmp6.i.11 = load i32, ptr addrspace(1) %getElem.11, align 4, !dbg !6
+  %add.i.11 = add nsw i32 %tmp6.i.11, %add.i.10, !dbg !6
+  %inc.i.11 = add nuw nsw i32 %i.0.i4, 12, !dbg !6
+  %14 = zext nneg i32 %inc.i.11 to i64, !dbg !6
+  %getElem.12 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %14, !dbg !6
+  %tmp6.i.12 = load i32, ptr addrspace(1) %getElem.12, align 4, !dbg !6
+  %add.i.12 = add nsw i32 %tmp6.i.12, %add.i.11, !dbg !6
+  %inc.i.12 = add nuw nsw i32 %i.0.i4, 13, !dbg !6
+  %15 = zext nneg i32 %inc.i.12 to i64, !dbg !6
+  %getElem.13 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %15, !dbg !6
+  %tmp6.i.13 = load i32, ptr addrspace(1) %getElem.13, align 4, !dbg !6
+  %add.i.13 = add nsw i32 %tmp6.i.13, %add.i.12, !dbg !6
+  %inc.i.13 = add nuw nsw i32 %i.0.i4, 14, !dbg !6
+  %16 = zext nneg i32 %inc.i.13 to i64, !dbg !6
+  %getElem.14 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %16, !dbg !6
+  %tmp6.i.14 = load i32, ptr addrspace(1) %getElem.14, align 4, !dbg !6
+  %add.i.14 = add nsw i32 %tmp6.i.14, %add.i.13, !dbg !6
+  %inc.i.14 = add nuw nsw i32 %i.0.i4, 15, !dbg !6
+  %17 = zext nneg i32 %inc.i.14 to i64, !dbg !6
+  %getElem.15 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %17, !dbg !6
+  %tmp6.i.15 = load i32, ptr addrspace(1) %getElem.15, align 4, !dbg !6
+  %add.i.15 = add nsw i32 %tmp6.i.15, %add.i.14, !dbg !6
+  %inc.i.15 = add nuw nsw i32 %i.0.i4, 16, !dbg !6
+  %niter.next.15 = add i32 %niter, 16, !dbg !6
+  %niter.ncmp.15.not = icmp eq i32 %niter.next.15, %unroll_iter, !dbg !6
+  br i1 %niter.ncmp.15.not, label %for.end.i.loopexit.unr-lcssa, label %for.body.i, !dbg !6, !llvm.loop !11
+
+for.end.i.loopexit.unr-lcssa:                     ; preds = %for.body.i, %for.body.i.preheader
+  %add.i.lcssa.ph = phi i32 [ poison, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+  %i.0.i4.unr = phi i32 [ 0, %for.body.i.preheader ], [ %inc.i.15, %for.body.i ]
+  %sum.0.i3.unr = phi i32 [ 0, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+  %lcmp.mod.not = icmp eq i32 %xtraiter, 0, !dbg !6
+  br i1 %lcmp.mod.not, label %for.end.i, label %for.body.i.epil.preheader, !dbg !6
+
+for.body.i.epil.preheader:                        ; preds = %for.end.i.loopexit.unr-lcssa
+  %xtraiter6 = and i32 %tmp3.i1, 7, !dbg !6
+  %18 = icmp samesign ult i32 %xtraiter, 8, !dbg !6
+  br i1 %18, label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, label %for.body.i.epil, !dbg !6
+
+for.body.i.epil:                                  ; preds = %for.body.i.epil.preheader
+  %19 = zext nneg i32 %i.0.i4.unr to i64, !dbg !6
+  %getElem.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %19, !dbg !6
+  %tmp6.i.epil = load i32, ptr addrspace(1) %getElem.epil, align 4, !dbg !6
+  %add.i.epil = add nsw i32 %tmp6.i.epil, %sum.0.i3.unr, !dbg !6
+  %inc.i.epil = add nuw nsw i32 %i.0.i4.unr, 1, !dbg !6
+  %20 = zext nneg i32 %inc.i.epil to i64, !dbg !6
+  %getElem.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %20, !dbg !6
+  %tmp6.i.epil.1 = load i32, ptr addrspace(1) %getElem.epil.1, align 4, !dbg !6
+  %add.i.epil.1 = add nsw i32 %tmp6.i.epil.1, %add.i.epil, !dbg !6
+  %inc.i.epil.1 = add nuw nsw i32 %i.0.i4.unr, 2, !dbg !6
+  %21 = zext nneg i32 %inc.i.epil.1 to i64, !dbg !6
+  %getElem.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %21, !dbg !6
+  %tmp6.i.epil.2 = load i32, ptr addrspace(1) %getElem.epil.2, align 4, !dbg !6
+  %add.i.epil.2 = add nsw i32 %tmp6.i.epil.2, %add.i.epil.1, !dbg !6
+  %inc.i.epil.2 = add nuw nsw i32 %i.0.i4.unr, 3, !dbg !6
+  %22 = zext nneg i32 %inc.i.epil.2 to i64, !dbg !6
+  %getElem.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %22, !dbg !6
+  %tmp6.i.epil.3 = load i32, ptr addrspace(1) %getElem.epil.3, align 4, !dbg !6
+  %add.i.epil.3 = add nsw i32 %tmp6.i.epil.3, %add.i.epil.2, !dbg !6
+  %inc.i.epil.3 = add nuw nsw i32 %i.0.i4.unr, 4, !dbg !6
+  %23 = zext nneg i32 %inc.i.epil.3 to i64, !dbg !6
+  %getElem.epil.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %23, !dbg !6
+  %tmp6.i.epil.4 = load i32, ptr addrspace(1) %getElem.epil.4, align 4, !dbg !6
+  %add.i.epil.4 = add nsw i32 %tmp6.i.epil.4, %add.i.epil.3, !dbg !6
+  %inc.i.epil.4 = add nuw nsw i32 %i.0.i4.unr, 5, !dbg !6
+  %24 = zext nneg i32 %inc.i.epil.4 to i64, !dbg !6
+  %getElem.epil.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %24, !dbg !6
+  %tmp6.i.epil.5 = load i32, ptr addrspace(1) %getElem.epil.5, align 4, !dbg !6
+  %add.i.epil.5 = add nsw i32 %tmp6.i.epil.5, %add.i.epil.4, !dbg !6
+  %inc.i.epil.5 = add nuw nsw i32 %i.0.i4.unr, 6, !dbg !6
+  %25 = zext nneg i32 %inc.i.epil.5 to i64, !dbg !6
+  %getElem.epil.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %25, !dbg !6
+  %tmp6.i.epil.6 = load i32, ptr addrspace(1) %getElem.epil.6, align 4, !dbg !6
+  %add.i.epil.6 = add nsw i32 %tmp6.i.epil.6, %add.i.epil.5, !dbg !6
+  %inc.i.epil.6 = add nuw nsw i32 %i.0.i4.unr, 7, !dbg !6
+  %26 = zext nneg i32 %inc.i.epil.6 to i64, !dbg !6
+  %getElem.epil.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %26, !dbg !6
+  %tmp6.i.epil.7 = load i32, ptr addrspace(1) %getElem.epil.7, align 4, !dbg !6
+  %add.i.epil.7 = add nsw i32 %tmp6.i.epil.7, %add.i.epil.6, !dbg !6
+  %inc.i.epil.7 = add nuw nsw i32 %i.0.i4.unr, 8, !dbg !6
+  br label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.unr-lcssa:        ; preds = %for.body.i.epil, %for.body.i.epil.preheader
+  %add.i.lcssa.ph5.ph = phi i32 [ poison, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+  %i.0.i4.epil.unr = phi i32 [ %i.0.i4.unr, %for.body.i.epil.preheader ], [ %inc.i.epil.7, %for.body.i.epil ]
+  %sum.0.i3.epil.unr = phi i32 [ %sum.0.i3.unr, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+  %lcmp.mod8.not = icmp eq i32 %xtraiter6, 0, !dbg !6
+  br i1 %lcmp.mod8.not, label %for.end.i, label %for.body.i.epil.epil.preheader, !dbg !6
+
+for.body.i.epil.epil.preheader:                   ; preds = %for.end.i.loopexit.epilog-lcssa.unr-lcssa
+  %xtraiter12 = and i32 %tmp3.i1, 3, !dbg !6
+  %27 = icmp samesign ult i32 %xtraiter6, 4, !dbg !6
+  br i1 %27, label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, label %for.body.i.epil.epil, !dbg !6
+
+for.body.i.epil.epil:                             ; preds = %for.body.i.epil.epil.preheader
+  %28 = zext nneg i32 %i.0.i4.epil.unr to i64, !dbg !6
+  %getElem.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %28, !dbg !6
+  %tmp6.i.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil, align 4, !dbg !6
+  %add.i.epil.epil = add nsw i32 %tmp6.i.epil.epil, %sum.0.i3.epil.unr, !dbg !6
+  %inc.i.epil.epil = add nuw nsw i32 %i.0.i4.epil.unr, 1, !dbg !6
+  %29 = zext nneg i32 %inc.i.epil.epil to i64, !dbg !6
+  %getElem.epil.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %29, !dbg !6
+  %tmp6.i.epil.epil.1 = load i32, ptr addrspace(1) %getElem.epil.epil.1, align 4, !dbg !6
+  %add.i.epil.epil.1 = add nsw i32 %tmp6.i.epil.epil.1, %add.i.epil.epil, !dbg !6
+  %inc.i.epil.epil.1 = add nuw nsw i32 %i.0.i4.epil.unr, 2, !dbg !6
+  %30 = zext nneg i32 %inc.i.epil.epil.1 to i64, !dbg !6
+  %getElem.epil.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %30, !dbg !6
+  %tmp6.i.epil.epil.2 = load i32, ptr addrspace(1) %getElem.epil.epil.2, align 4, !dbg !6
+  %add.i.epil.epil.2 = add nsw i32 %tmp6.i.epil.epil.2, %add.i.epil.epil.1, !dbg !6
+  %inc.i.epil.epil.2 = add nuw nsw i32 %i.0.i4.epil.unr, 3, !dbg !6
+  %31 = zext nneg i32 %inc.i.epil.epil.2 to i64, !dbg !6
+  %getElem.epil.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %31, !dbg !6
+  %tmp6.i.epil.epil.3 = load i32, ptr addrspace(1) %getElem.epil.epil.3, align 4, !dbg !6
+  %add.i.epil.epil.3 = add nsw i32 %tmp6.i.epil.epil.3, %add.i.epil.epil.2, !dbg !6
+  %inc.i.epil.epil.3 = add nuw nsw i32 %i.0.i4.epil.unr, 4, !dbg !6
+  br label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa: ; preds = %for.body.i.epil.epil, %for.body.i.epil.epil.preheader
+  %add.i.lcssa.ph5.ph9.ph = phi i32 [ poison, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+  %i.0.i4.epil.epil.unr = phi i32 [ %i.0.i4.epil.unr, %for.body.i.epil.epil.preheader ], [ %inc.i.epil.epil.3, %for.body.i.epil.epil ]
+  %sum.0.i3.epil.epil.unr = phi i32 [ %sum.0.i3.epil.unr, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+  %lcmp.mod14.not = icmp eq i32 %xtraiter12, 0, !dbg !6
+  br i1 %lcmp.mod14.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6
+
+for.body.i.epil.epil.epil:                        ; preds = %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.body.i.epil.epil.epil
+  %i.0.i4.epil.epil.epil = phi i32 [ %inc.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %i.0.i4.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %sum.0.i3.epil.epil.epil = phi i32 [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %sum.0.i3.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %epil.iter13 = phi i32 [ %epil.iter13.next, %for.body.i.epil.epil.epil ], [ 0, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+  %32 = zext nneg i32 %i.0.i4.epil.epil.epil to i64, !dbg !6
+  %getElem.epil.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %32, !dbg !6
+  %tmp6.i.epil.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil.epil, align 4, !dbg !6
+  %add.i.epil.epil.epil = add nsw i32 %tmp6.i.epil.epil.epil, %sum.0.i3.epil.epil.epil, !dbg !6
+  %inc.i.epil.epil.epil = add nuw nsw i32 %i.0.i4.epil.epil.epil, 1, !dbg !6
+  %epil.iter13.next = add i32 %epil.iter13, 1, !dbg !6
+  %epil.iter13.cmp.not = icmp eq i32 %epil.iter13.next, %xtraiter12, !dbg !6
+  br i1 %epil.iter13.cmp.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6, !llvm.loop !14
+
+for.end.i:                                        ; preds = %for.body.i.epil.epil.epil, %for.end.i.loopexit.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.unr-lcssa, %entry
+  %sum.0.i.lcssa = phi i32 [ 0, %entry ], [ %add.i.lcssa.ph, %for.end.i.loopexit.unr-lcssa ], [ %add.i.lcssa.ph5.ph, %for.end.i.loopexit.epilog-lcssa.unr-lcssa ], [ %add.i.lcssa.ph5.ph9.ph, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ], [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], !dbg !6
+  %cmp10.i = icmp sgt i32 %sum.0.i.lcssa, 17, !dbg !15
+  %spec.select = select i1 %cmp10.i, i32 %sum.0.i.lcssa, i32 1, !dbg !15
+  %cmp = icmp sgt i32 %n, 7, !dbg !16
+  br i1 %cmp, label %if.then, label %if.end, !dbg !16
+
+if.then:                                          ; preds = %for.end.i
+  store i32 %spec.select, ptr addrspace(1) @gg, align 4, !dbg !17
+  br label %if.end, !dbg !17
+
+if.end:                                           ; preds = %if.then, %for.end.i
+  ret void, !dbg !19
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t3.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kerneli", scope: !1, file: !1, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 14, column: 3, scope: !7, inlinedAt: !9)
+!7 = distinct !DILexicalBlock(scope: !8, file: !1, line: 12, column: 27)
+!8 = distinct !DISubprogram(name: "C", linkageName: "_ZN1CC1Ev", scope: !1, file: !1, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!9 = distinct !DILocation(line: 24, column: 3, scope: !10)
+!10 = distinct !DILexicalBlock(scope: !4, file: !1, line: 23, column: 29)
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.mustprogress"}
+!13 = !{!"llvm.loop.unroll.disable"}
+!14 = distinct !{!14, !13}
+!15 = !DILocation(line: 15, column: 3, scope: !7, inlinedAt: !9)
+!16 = !DILocation(line: 25, column: 3, scope: !10)
+!17 = !DILocation(line: 26, column: 5, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !10, file: !1, line: 25, column: 3)
+!19 = !DILocation(line: 27, column: 1, scope: !10)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
new file mode 100644
index 0000000000000..57df39e8034c4
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
@@ -0,0 +1,229 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test multiple inline calls at the same level - verifies that inlined_at information
+;; is correctly emitted when multiple different functions (or multiple copies of the same function) are inlined into a single caller.
+;
+; __device__ __forceinline__ int foo(int a)
+; {
+;   if (a > 7)
+;     return a*a;
+;   return ++a;
+; }
+;
+; __device__ __forceinline__ int baz(int a)
+; {
+;   if (a > 23)
+;     return a*2;
+;   return ++a;
+; }
+;
+; __device__ int bar(int i, int j)
+; {
+;   return i + j;
+; }
+;
+; __device__ int d;
+;
+; // inlining two different functions
+; __global__ void kernel1(int x, int y)
+; {
+;   d = bar(foo(x), baz(y));
+; }
+;
+; // inlining two copies of same function
+; __global__ void kernel2(int x, int y)
+; {
+;   d = bar(foo(x), foo(y));
+; }
+;
+; // inlining two different functions, extra computation in caller (y+x)
+; __global__ void kernel3(int x, int y)
+; {
+;   d = bar(foo(x), baz(y + x));
+; }
+;
+; // inlining two copies of same function, extra computation in caller (y+x)
+; __global__ void kernel4(int x, int y)
+; {
+;   d = bar(foo(x), foo(y + x));
+; }
+;
+; CHECK: .entry _Z7kernel1ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+;
+; CHECK: .entry _Z7kernel2ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 31
+;
+; CHECK: .entry _Z7kernel3ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 37
+;
+; CHECK: .entry _Z7kernel4ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 43
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3fooi
+; CHECK: [[BAZNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3bazi
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at d = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [5 x ptr] [ptr @_Z7kernel1ii, ptr @_Z7kernel2ii, ptr @_Z7kernel3ii, ptr @_Z7kernel4ii, ptr addrspacecast (ptr addrspace(1) @d to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel1ii(i32 noundef %x, i32 noundef %y) #0 !dbg !11 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !13
+  %mul.i = mul nsw i32 %x, %x, !dbg !13
+  %inc.i = add nsw i32 %x, 1, !dbg !13
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !13
+  %cmp.i1 = icmp sgt i32 %y, 23, !dbg !18
+  %mul.i2 = shl nuw nsw i32 %y, 1, !dbg !18
+  %inc.i3 = add nsw i32 %y, 1, !dbg !18
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !18
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !22
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel2ii(i32 noundef %x, i32 noundef %y) #0 !dbg !28 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !29
+  %mul.i = mul nsw i32 %x, %x, !dbg !29
+  %inc.i = add nsw i32 %x, 1, !dbg !29
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !29
+  %cmp.i1 = icmp sgt i32 %y, 7, !dbg !32
+  %mul.i2 = mul nsw i32 %y, %y, !dbg !32
+  %inc.i3 = add nsw i32 %y, 1, !dbg !32
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !32
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !34
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !36
+  ret void, !dbg !37
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel3ii(i32 noundef %x, i32 noundef %y) #0 !dbg !38 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !39
+  %mul.i = mul nsw i32 %x, %x, !dbg !39
+  %inc.i = add nsw i32 %x, 1, !dbg !39
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !39
+  %add = add nsw i32 %y, %x, !dbg !42
+  %cmp.i1 = icmp sgt i32 %add, 23, !dbg !43
+  %mul.i2 = shl nuw nsw i32 %add, 1, !dbg !43
+  %inc.i3 = add nsw i32 %add, 1, !dbg !43
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !43
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !45
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !42
+  ret void, !dbg !47
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel4ii(i32 noundef %x, i32 noundef %y) #0 !dbg !48 {
+entry:
+  %cmp.i = icmp sgt i32 %x, 7, !dbg !49
+  %mul.i = mul nsw i32 %x, %x, !dbg !49
+  %inc.i = add nsw i32 %x, 1, !dbg !49
+  %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !49
+  %add = add nsw i32 %y, %x, !dbg !52
+  %cmp.i1 = icmp sgt i32 %add, 7, !dbg !53
+  %mul.i2 = mul nsw i32 %add, %add, !dbg !53
+  %inc.i3 = add nsw i32 %add, 1, !dbg !53
+  %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !53
+  %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !55
+  store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !52
+  ret void, !dbg !57
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvmir.version = !{!2, !3, !4}
+!nvvm.annotations = !{!5, !6, !7, !8}
+!llvm.module.flags = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t4.cu", directory: "")
+!2 = !{i32 2, i32 0, i32 3, i32 2}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, i32 0, i32 3, i32 1}
+!5 = !{ptr @_Z7kernel1ii, !"kernel", i32 1}
+!6 = !{ptr @_Z7kernel2ii, !"kernel", i32 1}
+!7 = !{ptr @_Z7kernel3ii, !"kernel", i32 1}
+!8 = !{ptr @_Z7kernel4ii, !"kernel", i32 1}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{}
+!11 = distinct !DISubprogram(name: "kernel1", linkageName: "_Z7kernel1ii", scope: !1, file: !1, line: 23, type: !12, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!12 = !DISubroutineType(types: !10)
+!13 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlock(scope: !15, file: !1, line: 2, column: 1)
+!15 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !11, file: !1, line: 24, column: 1)
+!18 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 9, column: 1)
+!20 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazi", scope: !1, file: !1, line: 8, type: !12, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!22 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 16, column: 1)
+!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", scope: !1, file: !1, line: 15, type: !12, scopeLine: 15, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!26 = !DILocation(line: 25, column: 3, scope: !17)
+!27 = !DILocation(line: 26, column: 1, scope: !17)
+!28 = distinct !DISubprogram(name: "kernel2", linkageName: "_Z7kernel2ii", scope: !1, file: !1, line: 29, type: !12, scopeLine: 29, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!29 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !30)
+!30 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 30, column: 1)
+!32 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !33)
+!33 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!34 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !35)
+!35 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!36 = !DILocation(line: 31, column: 3, scope: !31)
+!37 = !DILocation(line: 32, column: 1, scope: !31)
+!38 = distinct !DISubprogram(name: "kernel3", linkageName: "_Z7kernel3ii", scope: !1, file: !1, line: 35, type: !12, scopeLine: 35, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!39 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !40)
+!40 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!41 = distinct !DILexicalBlock(scope: !38, file: !1, line: 36, column: 1)
+!42 = !DILocation(line: 37, column: 3, scope: !41)
+!43 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !44)
+!44 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!45 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !46)
+!46 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!47 = !DILocation(line: 38, column: 1, scope: !41)
+!48 = distinct !DISubprogram(name: "kernel4", linkageName: "_Z7kernel4ii", scope: !1, file: !1, line: 41, type: !12, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!49 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !50)
+!50 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !48, file: !1, line: 42, column: 1)
+!52 = !DILocation(line: 43, column: 3, scope: !51)
+!53 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !54)
+!54 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!55 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !56)
+!56 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!57 = !DILocation(line: 44, column: 1, scope: !51)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
new file mode 100644
index 0000000000000..dc242a95f54d8
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test deep inline chain - verifies that inlined_at information is correctly emitted
+;; through an 11-level deep chain of inlining (foo0 through foo10).
+;
+; __device__ int foo0(int a, int b) {
+;   if (a > b)
+;     return --a;
+;   if (a > b - 7)
+;     return a*2;
+;   return ++a;
+; }
+; __device__ int foo1(int a, int b) {
+;   return foo0(a*3, b*b);
+; }
+; __device__ int foo2(int a, int b) {
+;   return foo1(a+2, b*b);
+; }
+; __device__ int foo3(int a, int b) {
+;   return foo2(a+100, b-7);
+; }
+; __device__ int foo4(int a, int b) {
+;   return foo3(a*a, b*3);
+; }
+; __device__ int foo5(int a, int b) {
+;   return foo4(a*3, b*3);
+; }
+; __device__ int foo6(int a, int b) {
+;   return foo5(a*3, b*3);
+; }
+; __device__ int foo7(int a, int b) {
+;   return foo6(a*a + 2, b*b*5);
+; }
+; __device__ int foo8(int a, int b) {
+;   return foo7(a*2, b*2*a);
+; }
+; __device__ int foo9(int a, int b) {
+;   return foo8(a*2, b*2*a);
+; }
+; __device__ int foo10(int a, int b) {
+;   return foo9(a*2*b, b*2);
+; }
+;
+; __device__ int g;
+;
+; __global__ void kernel(int a, int b) {
+;   g = foo10(a, b);
+; }
+;
+; CHECK: .entry _Z6kernelii(
+; CHECK: .loc [[FILENUM:[1-9]]] 42
+; CHECK: .loc [[FILENUM]] 36 {{[0-9]*}}, function_name [[FOO10NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 42
+; CHECK: .loc [[FILENUM]] 33 {{[0-9]*}}, function_name [[FOO9NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 36
+; CHECK: .loc [[FILENUM]] 30 {{[0-9]*}}, function_name [[FOO8NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 33
+; CHECK: .loc [[FILENUM]] 27 {{[0-9]*}}, function_name [[FOO7NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 30
+; CHECK: .loc [[FILENUM]] 24 {{[0-9]*}}, function_name [[FOO6NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 27
+; CHECK: .loc [[FILENUM]] 21 {{[0-9]*}}, function_name [[FOO5NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .loc [[FILENUM]] 18 {{[0-9]*}}, function_name [[FOO4NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[FOO3NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 12 {{[0-9]*}}, function_name [[FOO2NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 15
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOO1NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 12
+; CHECK: .loc [[FILENUM]] 2 {{[0-9]*}}, function_name [[FOO0NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 9
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOO0NAME]], inlined_at [[FILENUM]] 9
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOO10NAME]]:
+; CHECK-NEXT: // {{.*}} _Z5foo10ii
+; CHECK: [[FOO9NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo9ii
+; CHECK: [[FOO8NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo8ii
+; CHECK: [[FOO7NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo7ii
+; CHECK: [[FOO6NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo6ii
+; CHECK: [[FOO5NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo5ii
+; CHECK: [[FOO4NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo4ii
+; CHECK: [[FOO3NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo3ii
+; CHECK: [[FOO2NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo2ii
+; CHECK: [[FOO1NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo1ii
+; CHECK: [[FOO0NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo0ii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at g = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelii, ptr addrspacecast (ptr addrspace(1) @g to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelii(i32 noundef %a, i32 noundef %b) #0 !dbg !5 {
+entry:
+  %mul.i = shl nsw i32 %a, 1, !dbg !7
+  %mul2.i = mul nsw i32 %mul.i, %b, !dbg !7
+  %mul.i.i = shl nsw i32 %mul2.i, 1, !dbg !12
+  %mul.i.i.i = shl nsw i32 %mul2.i, 2, !dbg !16
+  %mul4.i.i = shl i32 %b, 3, !dbg !12
+  %mul2.i.i.i = mul i32 %mul4.i.i, %mul2.i, !dbg !16
+  %mul4.i.i.i = mul nsw i32 %mul2.i.i.i, %mul.i.i, !dbg !16
+  %mul.i.i.i.i = mul i32 %mul2.i, 36, !dbg !20
+  %0 = mul i32 %mul.i.i.i.i, %mul.i.i.i, !dbg !24
+  %mul.i.i.i.i.i.i = add nuw nsw i32 %0, 18, !dbg !24
+  %mul4.i.i.i.i = mul i32 %mul4.i.i.i, 135, !dbg !20
+  %mul3.i.i.i.i.i.i.i = mul i32 %mul4.i.i.i.i, %mul4.i.i.i, !dbg !31
+  %sub.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i, -7, !dbg !35
+  %mul.i.i.i.i.i.i.i.i.i = mul nsw i32 %sub.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i, !dbg !39
+  %mul.i.i.i.i.i.i.i = mul i32 %mul.i.i.i.i.i.i, 3, !dbg !31
+  %1 = mul i32 %mul.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i, !dbg !43
+  %mul.i.i.i.i.i.i.i.i.i.i = add i32 %1, 306, !dbg !43
+  %mul3.i.i.i.i.i.i.i.i.i.i = mul nuw nsw i32 %mul.i.i.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i.i.i.i, !dbg !43
+  %cmp.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %mul3.i.i.i.i.i.i.i.i.i.i, !dbg !47
+  br i1 %cmp.i.i.i.i.i.i.i.i.i.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i.i, label %if.end.i.i.i.i.i.i.i.i.i.i.i, !dbg !47
+
+if.then.i.i.i.i.i.i.i.i.i.i.i:                    ; preds = %entry
+  %dec.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 305, !dbg !51
+  br label %_Z5foo10ii.exit, !dbg !51
+
+if.end.i.i.i.i.i.i.i.i.i.i.i:                     ; preds = %entry
+  %sub.i.i.i.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i.i.i.i, -7, !dbg !53
+  %cmp5.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+  br i1 %cmp5.i.i.i.i.i.i.i.i.i.i.i, label %if.then6.i.i.i.i.i.i.i.i.i.i.i, label %if.end8.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+
+if.then6.i.i.i.i.i.i.i.i.i.i.i:                   ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+  %mul.i.i.i.i.i.i.i.i.i.i.i = shl nsw i32 %mul.i.i.i.i.i.i.i.i.i.i, 1, !dbg !54
+  br label %_Z5foo10ii.exit, !dbg !54
+
+if.end8.i.i.i.i.i.i.i.i.i.i.i:                    ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+  %inc.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 307, !dbg !56
+  br label %_Z5foo10ii.exit, !dbg !56
+
+_Z5foo10ii.exit:                                  ; preds = %if.then.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i
+  %retval.0.i.i.i.i.i.i.i.i.i.i.i = phi i32 [ %dec.i.i.i.i.i.i.i.i.i.i.i, %if.then.i.i.i.i.i.i.i.i.i.i.i ], [ %mul.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i ], [ %inc.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i ], !dbg !56
+  store i32 %retval.0.i.i.i.i.i.i.i.i.i.i.i, ptr addrspace(1) @g, align 4, !dbg !57
+  ret void, !dbg !58
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "t5.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelii, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelii", scope: !1, file: !1, line: 41, type: !6, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 35, column: 28)
+!9 = distinct !DISubprogram(name: "foo10", linkageName: "_Z5foo10ii", scope: !1, file: !1, line: 35, type: !6, scopeLine: 35, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 42, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 41, column: 29)
+!12 = !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 32, column: 28)
+!14 = distinct !DISubprogram(name: "foo9", linkageName: "_Z4foo9ii", scope: !1, file: !1, line: 32, type: !6, scopeLine: 32, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!16 = !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 29, column: 28)
+!18 = distinct !DISubprogram(name: "foo8", linkageName: "_Z4foo8ii", scope: !1, file: !1, line: 29, type: !6, scopeLine: 29, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!19 = distinct !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!20 = !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 26, column: 28)
+!22 = distinct !DISubprogram(name: "foo7", linkageName: "_Z4foo7ii", scope: !1, file: !1, line: 26, type: !6, scopeLine: 26, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!23 = distinct !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!24 = !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 20, column: 28)
+!26 = distinct !DISubprogram(name: "foo5", linkageName: "_Z4foo5ii", scope: !1, file: !1, line: 20, type: !6, scopeLine: 20, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 24, column: 3, scope: !28, inlinedAt: !30)
+!28 = distinct !DILexicalBlock(scope: !29, file: !1, line: 23, column: 28)
+!29 = distinct !DISubprogram(name: "foo6", linkageName: "_Z4foo6ii", scope: !1, file: !1, line: 23, type: !6, scopeLine: 23, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!30 = distinct !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!31 = !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!32 = distinct !DILexicalBlock(scope: !33, file: !1, line: 17, column: 28)
+!33 = distinct !DISubprogram(name: "foo4", linkageName: "_Z4foo4ii", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!34 = distinct !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!35 = !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!36 = distinct !DILexicalBlock(scope: !37, file: !1, line: 14, column: 28)
+!37 = distinct !DISubprogram(name: "foo3", linkageName: "_Z4foo3ii", scope: !1, file: !1, line: 14, type: !6, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!38 = distinct !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!39 = !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!40 = distinct !DILexicalBlock(scope: !41, file: !1, line: 11, column: 28)
+!41 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2ii", scope: !1, file: !1, line: 11, type: !6, scopeLine: 11, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!42 = distinct !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!43 = !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!44 = distinct !DILexicalBlock(scope: !45, file: !1, line: 8, column: 28)
+!45 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ii", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!46 = distinct !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!47 = !DILocation(line: 2, column: 3, scope: !48, inlinedAt: !50)
+!48 = distinct !DILexicalBlock(scope: !49, file: !1, line: 1, column: 28)
+!49 = distinct !DISubprogram(name: "foo0", linkageName: "_Z4foo0ii", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!50 = distinct !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!51 = !DILocation(line: 3, column: 5, scope: !52, inlinedAt: !50)
+!52 = distinct !DILexicalBlock(scope: !48, file: !1, line: 2, column: 3)
+!53 = !DILocation(line: 4, column: 3, scope: !48, inlinedAt: !50)
+!54 = !DILocation(line: 5, column: 5, scope: !55, inlinedAt: !50)
+!55 = distinct !DILexicalBlock(scope: !48, file: !1, line: 4, column: 3)
+!56 = !DILocation(line: 6, column: 3, scope: !48, inlinedAt: !50)
+!57 = !DILocation(line: 42, column: 3, scope: !11)
+!58 = !DILocation(line: 43, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
new file mode 100644
index 0000000000000..8c5e48814c59b
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
@@ -0,0 +1,343 @@
+; RUN: llc < %s -mattr=+ptx72 -O0 | FileCheck %s
+;
+;; Test same function inlined multiple times from different call sites - verifies that
+;; inlined_at information correctly distinguishes multiple inline instances of rrand() and znew() called from different locations within a loop.
+;
+; typedef unsigned long long MYSIZE_T; // avoid platform dependence
+; struct GridOpt {
+;     int launch;
+;     int sync;
+;     int size;
+;     MYSIZE_T itr;
+; };
+;
+; enum LaunchOpt {
+;     LaunchOptPerThreadStream = 0,
+;     LaunchOptBlockSharedStream,
+;     LaunchOptAllNullStream,
+;     LaunchOptForkBomb,
+;     LaunchOptLoopLaunch,
+;     LaunchOptRand,
+;     LaunchOptSize,
+; };
+;
+; // post launch sync options
+; enum SyncOpt {
+;     SyncOptAllThreadsSync = 0,
+;     SyncOptOneSyncPerBlock,
+;     SyncOptRand,
+;     SyncOptSize,
+; };
+;
+; // size of launch options
+; enum SizeOpt {
+;     SizeOptSingleWarp = 0,
+;     SizeOptMultiWarp,
+;     SizeOptMultiBlock,
+;     SizeOptRand,
+;     SizeOptSize,
+; };
+;
+;
+; // device side failure codes
+; enum ErrorStatus {
+;     Success = 0,
+;     DeviceRuntimeFailure  = 1,
+;     DeviceMallocFailure   = 2,
+;     DeviceHardwareFailure = 3,
+;     InvalidInput          = 4,
+; };
+;
+; __device__ MYSIZE_T znew( MYSIZE_T seed )
+; {
+;     return 36969 * ( seed & 65535 ) + seed >> 16;
+; }
+;
+; __device__ MYSIZE_T rrand( MYSIZE_T *seed )
+; {
+;     *seed = znew( *seed );
+;     return *seed;
+; }
+;
+; __device__ GridOpt loopOpt;
+;
+; __global__ void cnpWideLaunch(GridOpt opt, MYSIZE_T maxLaunches, MYSIZE_T randomSeed, int *status, int *launchCounts)
+; {
+;     // MYSIZE_T threadSeed = randomSeed + blockDim.x * blockIdx.x + threadIdx.x;
+;     MYSIZE_T blockSeed  = randomSeed + blockIdx.x;
+;
+;     // this device launch consumes a launch slot
+;     maxLaunches--;
+;
+;     // compute number of launches per block
+;     MYSIZE_T blockLaunches = maxLaunches  / gridDim.x;
+;     MYSIZE_T extraLaunches = maxLaunches - gridDim.x * blockLaunches;
+;     if (blockIdx.x < extraLaunches) {
+;         blockLaunches++;
+;     }
+;
+;     // clear launchcount with a thread in each block
+;     if (threadIdx.x == 0)
+;         launchCounts[blockIdx.x] = 0;
+;
+;     // compute per block random selections for sync/launch size/stream
+;     for (MYSIZE_T i = 0; i < opt.itr; i++) {
+;         loopOpt = opt;
+;         if (opt.launch == LaunchOptRand) {
+;             loopOpt.launch = rrand(&blockSeed) % (LaunchOptSize - 1);
+;         }
+;         if (opt.sync == SyncOptRand) {
+;             loopOpt.sync = rrand(&blockSeed) % (SyncOptSize - 1);
+;         }
+;         if (opt.size == SizeOptRand) {
+;             loopOpt.size = rrand(&blockSeed) % (SizeOptSize - 1);
+;         }
+;         __syncthreads();
+;         if (threadIdx.x == 0) {
+;             //printf("block %d launchCount %d blockLaunches:%d\n", blockIdx.x, launchCounts[blockIdx.x], blockLaunches);
+;             int launchCount = launchCounts[blockIdx.x];
+;
+;             // fail if block did not generate enough launches
+;             if (!*status && (launchCount != blockLaunches)) {
+;                 *status = DeviceHardwareFailure;
+;             }
+;             // clear last iteration's launch
+;             launchCounts[blockIdx.x] = 0;
+;         }
+;         __syncthreads();
+;
+;         // fail if a global error is present
+;         if (*status) {
+;             return;
+;         }
+;     }
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 84
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 84
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 87
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 87
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 90
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 90
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[RRANDNAME]]:
+; CHECK-NEXT: // {{.*}} _Z5rrandPy
+; CHECK: [[ZNEWNAME]]:
+; CHECK-NEXT: // {{.*}} _Z4znewy
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.GridOpt = type { i32, i32, i32, i64 }
+
+ at loopOpt = internal addrspace(1) global %struct.GridOpt zeroinitializer, align 8
+ at llvm.used = appending global [2 x ptr] [ptr @_Z13cnpWideLaunch7GridOptyyPiS0_, ptr addrspacecast (ptr addrspace(1) @loopOpt to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline convergent mustprogress norecurse nounwind willreturn
+define void @_Z13cnpWideLaunch7GridOptyyPiS0_(%struct.GridOpt noundef %opt, i64 noundef %maxLaunches, i64 noundef %randomSeed, ptr noundef captures(none) %status, ptr noundef captures(none) %launchCounts) #0 !dbg !4 {
+entry:
+  %0 = addrspacecast ptr %status to ptr addrspace(1)
+  %1 = addrspacecast ptr %launchCounts to ptr addrspace(1)
+  %opt.fca.0.extract = extractvalue %struct.GridOpt %opt, 0
+  %opt.fca.1.extract = extractvalue %struct.GridOpt %opt, 1
+  %opt.fca.2.extract = extractvalue %struct.GridOpt %opt, 2
+  %opt.fca.3.extract = extractvalue %struct.GridOpt %opt, 3
+  %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !6
+  %conv = zext nneg i32 %2 to i64, !dbg !6
+  %add = add i64 %randomSeed, %conv, !dbg !6
+  %dec = add i64 %maxLaunches, -1, !dbg !8
+  %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !dbg !9
+  %conv6 = zext nneg i32 %3 to i64, !dbg !9
+  %div = udiv i64 %dec, %conv6, !dbg !9
+  %mul = mul i64 %div, %conv6, !dbg !10
+  %sub = sub i64 %dec, %mul, !dbg !10
+  %cmp = icmp ugt i64 %sub, %conv, !dbg !11
+  %inc = zext i1 %cmp to i64, !dbg !11
+  %spec.select = add i64 %div, %inc, !dbg !11
+  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12
+  %cmp20 = icmp eq i32 %4, 0, !dbg !12
+  br i1 %cmp20, label %if.then22, label %if.end26, !dbg !12
+
+if.then22:                                        ; preds = %entry
+  %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv, !dbg !13
+  store i32 0, ptr addrspace(1) %getElem, align 4, !dbg !13
+  br label %if.end26, !dbg !13
+
+if.end26:                                         ; preds = %if.then22, %entry
+  %cmp3021.not = icmp eq i64 %opt.fca.3.extract, 0, !dbg !15
+  br i1 %cmp3021.not, label %return, label %for.body.preheader, !dbg !15
+
+for.body.preheader:                               ; preds = %if.end26
+  %cmp34 = icmp eq i32 %opt.fca.0.extract, 5
+  %cmp41 = icmp eq i32 %opt.fca.1.extract, 2
+  %cmp50 = icmp eq i32 %opt.fca.2.extract, 3
+  %getElem11 = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv
+  br label %for.body, !dbg !16
+
+for.body:                                         ; preds = %for.body.preheader, %if.end83
+  %i.023 = phi i64 [ %inc90, %if.end83 ], [ 0, %for.body.preheader ]
+  %blockSeed.022 = phi i64 [ %blockSeed.3, %if.end83 ], [ %add, %for.body.preheader ]
+  %opt.elt = extractvalue %struct.GridOpt %opt, 0, !dbg !20
+  %opt.elt25 = extractvalue %struct.GridOpt %opt, 1, !dbg !20
+  %5 = insertelement <2 x i32> poison, i32 %opt.elt, i64 0, !dbg !20
+  %6 = insertelement <2 x i32> %5, i32 %opt.elt25, i64 1, !dbg !20
+  store <2 x i32> %6, ptr addrspace(1) @loopOpt, align 8, !dbg !20
+  %opt.elt26 = extractvalue %struct.GridOpt %opt, 2, !dbg !20
+  store i32 %opt.elt26, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !20
+  %opt.elt27 = extractvalue %struct.GridOpt %opt, 3, !dbg !20
+  store i64 %opt.elt27, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 16), align 8, !dbg !20
+  br i1 %cmp34, label %if.then36, label %if.end38, !dbg !16
+
+if.then36:                                        ; preds = %for.body
+  %and.i = and i64 %blockSeed.022, 65535, !dbg !21
+  %mul.i = mul nuw nsw i64 %and.i, 36969, !dbg !21
+  %add.i = add i64 %mul.i, %blockSeed.022, !dbg !21
+  %shr.i = lshr i64 %add.i, 16, !dbg !21
+  %rem = urem i64 %shr.i, 5, !dbg !29
+  %conv37 = trunc nuw nsw i64 %rem to i32, !dbg !29
+  store i32 %conv37, ptr addrspace(1) @loopOpt, align 8, !dbg !29
+  br label %if.end38, !dbg !29
+
+if.end38:                                         ; preds = %if.then36, %for.body
+  %blockSeed.1 = phi i64 [ %shr.i, %if.then36 ], [ %blockSeed.022, %for.body ], !dbg !30
+  br i1 %cmp41, label %if.then43, label %if.end47, !dbg !30
+
+if.then43:                                        ; preds = %if.end38
+  %and.i13 = and i64 %blockSeed.1, 65535, !dbg !31
+  %mul.i14 = mul nuw nsw i64 %and.i13, 36969, !dbg !31
+  %add.i15 = add i64 %mul.i14, %blockSeed.1, !dbg !31
+  %shr.i16 = lshr i64 %add.i15, 16, !dbg !31
+  %7 = trunc i64 %shr.i16 to i32, !dbg !35
+  %conv46 = and i32 %7, 1, !dbg !35
+  store i32 %conv46, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 4), align 4, !dbg !35
+  br label %if.end47, !dbg !35
+
+if.end47:                                         ; preds = %if.then43, %if.end38
+  %blockSeed.2 = phi i64 [ %shr.i16, %if.then43 ], [ %blockSeed.1, %if.end38 ], !dbg !36
+  br i1 %cmp50, label %if.then52, label %if.end56, !dbg !36
+
+if.then52:                                        ; preds = %if.end47
+  %and.i17 = and i64 %blockSeed.2, 65535, !dbg !37
+  %mul.i18 = mul nuw nsw i64 %and.i17, 36969, !dbg !37
+  %add.i19 = add i64 %mul.i18, %blockSeed.2, !dbg !37
+  %shr.i20 = lshr i64 %add.i19, 16, !dbg !37
+  %rem54 = urem i64 %shr.i20, 3, !dbg !41
+  %conv55 = trunc nuw nsw i64 %rem54 to i32, !dbg !41
+  store i32 %conv55, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !41
+  br label %if.end56, !dbg !41
+
+if.end56:                                         ; preds = %if.then52, %if.end47
+  %blockSeed.3 = phi i64 [ %shr.i20, %if.then52 ], [ %blockSeed.2, %if.end47 ], !dbg !42
+  tail call void @llvm.nvvm.barrier0(), !dbg !42
+  br i1 %cmp20, label %if.then61, label %if.end83, !dbg !43
+
+if.then61:                                        ; preds = %if.end56
+  %tmp67 = load i32, ptr addrspace(1) %getElem11, align 4, !dbg !44
+  %tmp69 = load i32, ptr addrspace(1) %0, align 4, !dbg !46
+  %tobool.not = icmp eq i32 %tmp69, 0, !dbg !46
+  %conv71 = sext i32 %tmp67 to i64, !dbg !46
+  %cmp73 = icmp ne i64 %spec.select, %conv71, !dbg !46
+  %or.cond = select i1 %tobool.not, i1 %cmp73, i1 false, !dbg !46
+  br i1 %or.cond, label %if.then75, label %if.end77, !dbg !46
+
+if.then75:                                        ; preds = %if.then61
+  store i32 3, ptr addrspace(1) %0, align 4, !dbg !47
+  br label %if.end77, !dbg !47
+
+if.end77:                                         ; preds = %if.then61, %if.then75
+  store i32 0, ptr addrspace(1) %getElem11, align 4, !dbg !49
+  br label %if.end83, !dbg !49
+
+if.end83:                                         ; preds = %if.end77, %if.end56
+  tail call void @llvm.nvvm.barrier0(), !dbg !50
+  %tmp85 = load i32, ptr addrspace(1) %0, align 4, !dbg !51
+  %tobool86.not = icmp eq i32 %tmp85, 0, !dbg !51
+  %inc90 = add nuw i64 %i.023, 1
+  %cmp30 = icmp ult i64 %inc90, %opt.fca.3.extract
+  %or.cond24 = select i1 %tobool86.not, i1 %cmp30, i1 false, !dbg !51
+  br i1 %or.cond24, label %for.body, label %return, !dbg !51, !llvm.loop !52
+
+return:                                           ; preds = %if.end83, %if.end26
+  ret void, !dbg !54
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { alwaysinline convergent mustprogress norecurse nounwind willreturn "target-cpu"="sm_75" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t6.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "cnpWideLaunch", linkageName: "_Z13cnpWideLaunch7GridOptyyPiS0_", scope: !1, file: !1, line: 61, type: !5, scopeLine: 61, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 64, column: 5, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 62, column: 1)
+!8 = !DILocation(line: 67, column: 5, scope: !7)
+!9 = !DILocation(line: 70, column: 5, scope: !7)
+!10 = !DILocation(line: 71, column: 5, scope: !7)
+!11 = !DILocation(line: 72, column: 5, scope: !7)
+!12 = !DILocation(line: 77, column: 5, scope: !7)
+!13 = !DILocation(line: 78, column: 9, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !7, file: !1, line: 77, column: 5)
+!15 = !DILocation(line: 81, column: 5, scope: !7)
+!16 = !DILocation(line: 83, column: 9, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 81, column: 5)
+!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 81, column: 5)
+!19 = distinct !DILexicalBlock(scope: !7, file: !1, line: 81, column: 5)
+!20 = !DILocation(line: 82, column: 9, scope: !17)
+!21 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 49, column: 1)
+!23 = distinct !DISubprogram(name: "znew", linkageName: "_Z4znewy", scope: !1, file: !1, line: 48, type: !5, scopeLine: 48, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!24 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 54, column: 1)
+!26 = distinct !DISubprogram(name: "rrand", linkageName: "_Z5rrandPy", scope: !1, file: !1, line: 53, type: !5, scopeLine: 53, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 84, column: 13, scope: !28)
+!28 = distinct !DILexicalBlock(scope: !17, file: !1, line: 83, column: 9)
+!29 = !DILocation(line: 84, column: 13, scope: !28)
+!30 = !DILocation(line: 86, column: 9, scope: !17)
+!31 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !32)
+!32 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !33)
+!33 = distinct !DILocation(line: 87, column: 13, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !17, file: !1, line: 86, column: 9)
+!35 = !DILocation(line: 87, column: 13, scope: !34)
+!36 = !DILocation(line: 89, column: 9, scope: !17)
+!37 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !38)
+!38 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !39)
+!39 = distinct !DILocation(line: 90, column: 13, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !17, file: !1, line: 89, column: 9)
+!41 = !DILocation(line: 90, column: 13, scope: !40)
+!42 = !DILocation(line: 92, column: 9, scope: !17)
+!43 = !DILocation(line: 93, column: 9, scope: !17)
+!44 = !DILocation(line: 95, column: 13, scope: !45)
+!45 = distinct !DILexicalBlock(scope: !17, file: !1, line: 93, column: 9)
+!46 = !DILocation(line: 98, column: 13, scope: !45)
+!47 = !DILocation(line: 99, column: 17, scope: !48)
+!48 = distinct !DILexicalBlock(scope: !45, file: !1, line: 98, column: 13)
+!49 = !DILocation(line: 102, column: 13, scope: !45)
+!50 = !DILocation(line: 104, column: 9, scope: !17)
+!51 = !DILocation(line: 107, column: 9, scope: !17)
+!52 = distinct !{!52, !53}
+!53 = !{!"llvm.loop.mustprogress"}
+!54 = !DILocation(line: 111, column: 1, scope: !7)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
new file mode 100644
index 0000000000000..9a76fe7ac4a1f
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc < %s -mattr=+ptx70 | FileCheck %s --check-prefix=PTXVERSION
+; RUN: llc < %s -mattr=+ptx72 --line-info-inlined-at=false | FileCheck %s --check-prefix=NOFLAG
+;
+;; Test command-line flags to control inlined_at emission - verifies that PTX 7.2+ emits
+;; function_name and inlined_at by default, while PTX 7.0 or --line-info-inlined-at=false disables it.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+;   if (gg > 7)
+;     bar();
+;     calculate();
+; }
+;
+; __device__ void bar() {
+;   if (gg > 17)
+;     foo();
+;     calculate();
+; }
+;
+; __global__ void kernel() {
+;   foo();
+; }
+;
+; DEFAULT: .loc [[FILENUM:[1-9]]] 10
+; DEFAULT: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .section .debug_str
+; DEFAULT: {
+; DEFAULT: [[BARNAME]]:
+; DEFAULT-NEXT: // {{.*}} _Z3barv
+; DEFAULT: }
+
+; NOFLAG-NOT: function_name
+; NOFLAG-NOT: inlined_at {{[1-9]}}
+; NOFLAG-NOT: .section .debug_str
+
+; PTXVERSION-NOT: function_name
+; PTXVERSION-NOT: inlined_at {{[1-9]}}
+; PTXVERSION-NOT: .section .debug_str
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: mustprogress willreturn
+define internal fastcc void @_Z3foov() unnamed_addr #0 !dbg !4 {
+entry:
+  %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+  %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+  br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then:                                          ; preds = %entry
+  %cmp.i = icmp samesign ugt i32 %tmp, 17, !dbg !8
+  br i1 %cmp.i, label %if.then.i, label %_Z3barv.exit, !dbg !8
+
+if.then.i:                                        ; preds = %if.then
+  tail call fastcc void @_Z3foov(), !dbg !13
+  br label %_Z3barv.exit, !dbg !13
+
+_Z3barv.exit:                                     ; preds = %if.then, %if.then.i
+  tail call void @_Z9calculatev(), !dbg !15
+  br label %if.end, !dbg !16
+
+if.end:                                           ; preds = %_Z3barv.exit, %entry
+  tail call void @_Z9calculatev(), !dbg !17
+  ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr #1
+
+; Function Attrs: alwaysinline mustprogress willreturn
+define void @_Z6kernelv() #2 !dbg !19 {
+entry:
+  tail call fastcc void @_Z3foov(), !dbg !20
+  ret void, !dbg !22
+}
+
+attributes #0 = { mustprogress willreturn "target-cpu"="sm_75" }
+attributes #1 = { "target-cpu"="sm_75" }
+attributes #2 = { alwaysinline mustprogress willreturn "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t7.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 15, column: 3, scope: !9, inlinedAt: !11)
+!9 = distinct !DILexicalBlock(scope: !10, file: !1, line: 14, column: 29)
+!10 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = distinct !DILocation(line: 10, column: 5, scope: !12)
+!12 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!13 = !DILocation(line: 16, column: 5, scope: !14, inlinedAt: !11)
+!14 = distinct !DILexicalBlock(scope: !9, file: !1, line: 15, column: 3)
+!15 = !DILocation(line: 17, column: 3, scope: !9, inlinedAt: !11)
+!16 = !DILocation(line: 10, column: 5, scope: !12)
+!17 = !DILocation(line: 11, column: 3, scope: !7)
+!18 = !DILocation(line: 12, column: 1, scope: !7)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!20 = !DILocation(line: 21, column: 3, scope: !21)
+!21 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!22 = !DILocation(line: 22, column: 1, scope: !21)



More information about the llvm-commits mailing list