[llvm] [DebugInfo][NVPTX] Adding support for `inlined_at` debug directive in NVPTX backend (PR #170239)
Laxman Sole via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 11 18:55:02 PST 2025
https://github.com/laxmansole updated https://github.com/llvm/llvm-project/pull/170239
>From cd4de2f207bb8d16f68a9a5ab91f9339586ebdd0 Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Wed, 26 Nov 2025 15:55:11 -0800
Subject: [PATCH 1/3] Adding support for inlined_at debug directive in NVPTX
backend
---
llvm/include/llvm/CodeGen/AsmPrinter.h | 4 +
llvm/include/llvm/MC/MCStreamer.h | 8 +
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 4 +-
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 19 +-
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 21 +-
.../CodeGen/AsmPrinter/DwarfStringPool.cpp | 7 +-
llvm/lib/MC/MCAsmStreamer.cpp | 123 +++++--
llvm/lib/Target/NVPTX/CMakeLists.txt | 3 +-
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 5 +
llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 3 +
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp | 165 +++++++++
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h | 52 +++
llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll | 144 ++++++++
llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll | 85 +++++
llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll | 305 ++++++++++++++++
llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll | 229 ++++++++++++
llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll | 208 +++++++++++
llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll | 343 ++++++++++++++++++
llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll | 118 ++++++
19 files changed, 1801 insertions(+), 45 deletions(-)
create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
create mode 100644 llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
create mode 100644 llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index 7f99e81e7d1a0..fcfe0cebebb43 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -269,6 +269,10 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass {
AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer,
char &ID = AsmPrinter::ID);
+ /// Create the DwarfDebug handler. Targets can override this to provide
+ /// custom debug information handling.
+ virtual DwarfDebug *createDwarfDebug();
+
public:
~AsmPrinter() override;
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 79c715e3820a6..6fe858c3201eb 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -902,6 +902,14 @@ class LLVM_ABI MCStreamer {
StringRef FileName,
StringRef Comment = {});
+ /// This is same as emitDwarfLocDirective, except has capability to
+ /// add inlined_at information.
+ virtual void emitDwarfLocDirectiveWithInlinedAt(
+ unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+ unsigned LineIA, unsigned ColumnIA, const MCSymbol *Sym, unsigned Flags,
+ unsigned Isa, unsigned Discriminator, StringRef FileName,
+ StringRef Comment = {}) {}
+
/// This implements the '.loc_label Name' directive.
virtual void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 965f7f59ce9a9..61784b9f85a28 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -470,6 +470,8 @@ const MCSection *AsmPrinter::getCurrentSection() const {
return OutStreamer->getCurrentSectionOnly();
}
+DwarfDebug *AsmPrinter::createDwarfDebug() { return new DwarfDebug(this); }
+
void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -590,7 +592,7 @@ bool AsmPrinter::doInitialization(Module &M) {
Handlers.push_back(std::make_unique<CodeViewDebug>(this));
if (!EmitCodeView || M.getDwarfVersion()) {
if (hasDebugInfo()) {
- DD = new DwarfDebug(this);
+ DD = createDwarfDebug();
Handlers.push_back(std::unique_ptr<DwarfDebug>(DD));
}
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 40bfea059c707..3fd60d70cfb68 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2204,13 +2204,27 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
Flags |= DWARF2_FLAG_IS_STMT;
}
- RecordSourceLine(DL, Flags);
+ // Call the hook that allows targets to customize source line recording
+ recordSourceLineHook(*MI, DL, Flags);
// If we're not at line 0, remember this location.
if (DL.getLine())
PrevInstLoc = DL;
}
+// Default implementation of target-specific hook for custom source line
+// recording
+void DwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+ const DebugLoc &DL, unsigned Flags) {
+ SmallString<128> LocationString;
+ if (Asm->OutStreamer->isVerboseAsm()) {
+ raw_svector_ostream OS(LocationString);
+ DL.print(OS);
+ }
+ recordSourceLine(DL.getLine(), DL.getCol(), DL.getScope(), Flags,
+ LocationString);
+}
+
// Returns the position where we should place prologue_end, potentially nullptr,
// which means "no good place to put prologue_end". Returns true in the second
// return value if there are no setup instructions in this function at all,
@@ -2705,6 +2719,9 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
Asm->OutStreamer->getContext().setDwarfCompileUnitID(
getDwarfCompileUnitIDForLineTable(CU));
+ // Call target-specific hook for custom initialization
+ beginFunctionHook(*MF);
+
// Record beginning of function.
PrologEndLoc = emitInitialLocDirective(
*MF, Asm->OutStreamer->getContext().getDwarfCompileUnitID());
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 1a1b28a6fc035..58aeb09645984 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -394,9 +394,6 @@ class DwarfDebug : public DebugHandlerBase {
/// table for the same directory as DW_AT_comp_dir.
StringRef CompilationDir;
- /// Holder for the file specific debug information.
- DwarfFile InfoHolder;
-
/// Holders for the various debug information flags that we might need to
/// have exposed. See accessor functions below for description.
@@ -532,10 +529,6 @@ class DwarfDebug : public DebugHandlerBase {
MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
- const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
- return InfoHolder.getUnits();
- }
-
using InlinedEntity = DbgValueHistoryMap::InlinedEntity;
void ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
@@ -711,6 +704,8 @@ class DwarfDebug : public DebugHandlerBase {
void computeKeyInstructions(const MachineFunction *MF);
protected:
+ /// Holder for the file specific debug information.
+ DwarfFile InfoHolder;
/// Gather pre-function debug information.
void beginFunctionImpl(const MachineFunction *MF) override;
@@ -722,6 +717,18 @@ class DwarfDebug : public DebugHandlerBase {
void skippedNonDebugFunction() override;
+ /// Target-specific hook for custom initialization,
+ /// default implementation is empty, only being used for NVPTX target
+ virtual void beginFunctionHook(const MachineFunction &MF) {}
+
+ /// Target-specific hook for custom source line recording
+ virtual void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+ unsigned Flags);
+
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return InfoHolder.getUnits();
+ }
+
public:
//===--------------------------------------------------------------------===//
// Main entry points.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index d304c7efe2a75..4ccad65bdba91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -96,9 +96,12 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
if (ShouldCreateSymbols)
Asm.OutStreamer->emitLabel(Entry->getValue().Symbol);
+ // Emit a comment with the string offset and the string itself.
+ Asm.OutStreamer->AddComment(
+ "string offset=" + Twine(Entry->getValue().Offset) + " ; " +
+ StringRef(Entry->getKeyData(), Entry->getKeyLength()));
+
// Emit the string itself with a terminating null byte.
- Asm.OutStreamer->AddComment("string offset=" +
- Twine(Entry->getValue().Offset));
Asm.OutStreamer->emitBytes(
StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index e2543058394a2..bdfaff0a22417 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -70,6 +70,17 @@ class MCAsmStreamer final : public MCStreamer {
void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+ /// Helper to emit common .loc directive flags, isa, and discriminator
+ void emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+ unsigned Discriminator);
+
+ /// Helper to emit the common suffix of .loc directives (flags, comment, EOL,
+ /// parent call)
+ void emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+ unsigned Column, unsigned Flags,
+ unsigned Isa, unsigned Discriminator,
+ StringRef FileName, StringRef Comment);
+
public:
MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
std::unique_ptr<MCInstPrinter> printer,
@@ -297,6 +308,14 @@ class MCAsmStreamer final : public MCStreamer {
StringRef Location = {}) override;
void emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) override;
+ void emitDwarfLocDirectiveWithInlinedAt(unsigned FileNo, unsigned Line,
+ unsigned Column, unsigned FileIA,
+ unsigned LineIA, unsigned ColIA,
+ const MCSymbol *Sym, unsigned Flags,
+ unsigned Isa, unsigned Discriminator,
+ StringRef FileName,
+ StringRef Comment = {}) override;
+
MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
bool emitCVFileDirective(unsigned FileNo, StringRef Filename,
@@ -1675,6 +1694,57 @@ void MCAsmStreamer::emitDwarfFile0Directive(
emitRawText(OS1.str());
}
+/// Helper to emit common .loc directive flags, isa, and discriminator
+void MCAsmStreamer::emitDwarfLocDirectiveFlags(unsigned Flags, unsigned Isa,
+ unsigned Discriminator) {
+ if (!MAI->supportsExtendedDwarfLocDirective())
+ return;
+
+ if (Flags & DWARF2_FLAG_BASIC_BLOCK)
+ OS << " basic_block";
+ if (Flags & DWARF2_FLAG_PROLOGUE_END)
+ OS << " prologue_end";
+ if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
+ OS << " epilogue_begin";
+
+ const unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
+ if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
+ OS << " is_stmt ";
+ OS << ((Flags & DWARF2_FLAG_IS_STMT) ? "1" : "0");
+ }
+
+ if (Isa)
+ OS << " isa " << Isa;
+ if (Discriminator)
+ OS << " discriminator " << Discriminator;
+}
+
+/// Helper to emit the common suffix of .loc directives
+void MCAsmStreamer::emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
+ unsigned Column, unsigned Flags,
+ unsigned Isa,
+ unsigned Discriminator,
+ StringRef FileName,
+ StringRef Comment) {
+ // Emit flags, isa, and discriminator
+ emitDwarfLocDirectiveFlags(Flags, Isa, Discriminator);
+
+ // Emit verbose comment if enabled
+ if (IsVerboseAsm) {
+ OS.PadToColumn(MAI->getCommentColumn());
+ OS << MAI->getCommentString() << ' ';
+ if (Comment.empty())
+ OS << FileName << ':' << Line << ':' << Column;
+ else
+ OS << Comment;
+ }
+
+ // Emit end of line and update parent state
+ EmitEOL();
+ MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
+ Discriminator, FileName, Comment);
+}
+
void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa, unsigned Discriminator,
@@ -1691,42 +1761,29 @@ void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
return;
}
+ // Emit the basic .loc directive
OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
- if (MAI->supportsExtendedDwarfLocDirective()) {
- if (Flags & DWARF2_FLAG_BASIC_BLOCK)
- OS << " basic_block";
- if (Flags & DWARF2_FLAG_PROLOGUE_END)
- OS << " prologue_end";
- if (Flags & DWARF2_FLAG_EPILOGUE_BEGIN)
- OS << " epilogue_begin";
-
- unsigned OldFlags = getContext().getCurrentDwarfLoc().getFlags();
- if ((Flags & DWARF2_FLAG_IS_STMT) != (OldFlags & DWARF2_FLAG_IS_STMT)) {
- OS << " is_stmt ";
-
- if (Flags & DWARF2_FLAG_IS_STMT)
- OS << "1";
- else
- OS << "0";
- }
- if (Isa)
- OS << " isa " << Isa;
- if (Discriminator)
- OS << " discriminator " << Discriminator;
- }
+ // Emit common suffix (flags, comment, EOL, parent call)
+ emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+ FileName, Comment);
+}
- if (IsVerboseAsm) {
- OS.PadToColumn(MAI->getCommentColumn());
- OS << MAI->getCommentString() << ' ';
- if (Comment.empty())
- OS << FileName << ':' << Line << ':' << Column;
- else
- OS << Comment;
- }
- EmitEOL();
- this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
- Discriminator, FileName, Comment);
+/// This is same as emitDwarfLocDirective, except also emits inlined function
+/// and inlined callsite information.
+void MCAsmStreamer::emitDwarfLocDirectiveWithInlinedAt(
+ unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
+ unsigned LineIA, unsigned ColIA, const MCSymbol *Sym, unsigned Flags,
+ unsigned Isa, unsigned Discriminator, StringRef FileName,
+ StringRef Comment) {
+ // Emit the basic .loc directive with NVPTX-specific extensions
+ OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
+ OS << ", function_name " << *Sym;
+ OS << ", inlined_at " << FileIA << " " << LineIA << " " << ColIA;
+
+ // Emit common suffix (flags, comment, EOL, parent call)
+ emitDwarfLocDirectiveSuffix(FileNo, Line, Column, Flags, Isa, Discriminator,
+ FileName, Comment);
}
void MCAsmStreamer::emitDwarfLocLabelDirective(SMLoc Loc, StringRef Name) {
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 6fe58c25c757d..505e7e945f108 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,12 +18,13 @@ set(NVPTXCodeGen_sources
NVPTXAssignValidGlobalNames.cpp
NVPTXAtomicLower.cpp
NVPTXCtorDtorLowering.cpp
- NVPTXIRPeephole.cpp
+ NVPTXDwarfDebug.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
NVPTXImageOptimizer.cpp
NVPTXInstrInfo.cpp
+ NVPTXIRPeephole.cpp
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9bbb3aad89c44..d5c9988d0eb53 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/NVPTXMCAsmInfo.h"
#include "MCTargetDesc/NVPTXTargetStreamer.h"
#include "NVPTX.h"
+#include "NVPTXDwarfDebug.h"
#include "NVPTXMCExpr.h"
#include "NVPTXMachineFunctionInfo.h"
#include "NVPTXRegisterInfo.h"
@@ -676,6 +677,10 @@ void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
OutStreamer->emitRawText(OS1.str());
}
+DwarfDebug *NVPTXAsmPrinter::createDwarfDebug() {
+ return new NVPTXDwarfDebug(this);
+}
+
bool NVPTXAsmPrinter::doInitialization(Module &M) {
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
const NVPTXSubtarget &STI = *NTM.getSubtargetImpl();
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index f35931868d99f..20c1bbb9e15b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -194,6 +194,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
bool doInitialization(Module &M) override;
bool doFinalization(Module &M) override;
+ // Create NVPTX-specific DwarfDebug handler
+ DwarfDebug *createDwarfDebug() override;
+
private:
bool GlobalsEmitted;
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
new file mode 100644
index 0000000000000..57410179a6344
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -0,0 +1,165 @@
+//===-- NVPTXDwarfDebug.cpp - NVPTX DwarfDebug Implementation ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper functions for NVPTX-specific debug information
+// processing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXDwarfDebug.h"
+#include "NVPTXSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+// Command line option to control inlined_at enhancement to lineinfo support.
+// Valid only when debuginfo emissionkind is DebugDirectivesOnly or
+// LineTablesOnly.
+static cl::opt<bool> LineInfoWithInlinedAt(
+ "line-info-inlined-at",
+ cl::desc("Emit line with inlined_at enhancement for NVPTX"), cl::init(true),
+ cl::Hidden);
+
+NVPTXDwarfDebug::NVPTXDwarfDebug(AsmPrinter *A) : DwarfDebug(A) {}
+
+// Collect all inlined_at locations for the current function.
+void NVPTXDwarfDebug::collectInlinedAtLocations(const MachineFunction &MF) {
+ const DISubprogram *SP = MF.getFunction().getSubprogram();
+ assert(SP && "expecting valid subprogram here");
+
+ // inlined_at support requires PTX 7.2 or later.
+ const NVPTXSubtarget &STI = MF.getSubtarget<NVPTXSubtarget>();
+ if (STI.getPTXVersion() < 72)
+ return;
+
+ if (!(SP->getUnit()->isDebugDirectivesOnly() ||
+ SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly) ||
+ !LineInfoWithInlinedAt) // No enhanced lineinfo, we are done.
+ return;
+
+ for (const MachineBasicBlock &MBB : MF) {
+ for (const MachineInstr &MI : MBB) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (!DL)
+ continue;
+ const DILocation *InlinedAt = DL.getInlinedAt();
+ while (InlinedAt) {
+ if (!InlinedAtLocs.insert(InlinedAt).second)
+ break;
+ InlinedAt = InlinedAt->getInlinedAt();
+ }
+ }
+ }
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
+ unsigned Flags) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ // Maintain a work list of .loc to be emitted. If we are emitting the
+ // inlined_at directive, we might need to emit additional .loc prior
+ // to it for the location contained in the inlined_at.
+ SmallVector<const DILocation *, 8> WorkList;
+ DenseSet<const DILocation *> WorkListSet;
+ const DILocation *EmitLoc = DL.get();
+
+ const DISubprogram *SP = MI.getMF()->getFunction().getSubprogram();
+ const NVPTXSubtarget &STI = MI.getMF()->getSubtarget<NVPTXSubtarget>();
+ const bool EnhancedLineinfo =
+ LineInfoWithInlinedAt && (STI.getPTXVersion() >= 72) && SP &&
+ (SP->getUnit()->isDebugDirectivesOnly() ||
+ SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly);
+
+ while (EmitLoc) {
+ // Get the scope for the current location.
+ const DIScope *Scope = EmitLoc->getScope();
+ if (!Scope)
+ break; // scope is null, we are done.
+
+ // Check if this loc is already in work list, if so, we are done.
+ if (WorkListSet.contains(EmitLoc))
+ break;
+
+ // Add this location to the work list.
+ WorkList.push_back(EmitLoc);
+ WorkListSet.insert(EmitLoc);
+
+ if (!EnhancedLineinfo) // No enhanced lineinfo, we are done.
+ break;
+
+ const DILocation *IA = EmitLoc->getInlinedAt();
+ // Check if this has inlined_at information, and if we have not yet
+ // emitted the .loc for the inlined_at location.
+ if (IA && InlinedAtLocs.contains(IA))
+ EmitLoc = IA;
+ else // We are done
+ break;
+ }
+
+ const unsigned CUID = Asm->OutStreamer->getContext().getDwarfCompileUnitID();
+ // Traverse the work list, and emit .loc.
+ while (!WorkList.empty()) {
+ const DILocation *Current = WorkList.pop_back_val();
+ const DIScope *Scope = Current->getScope();
+
+ if (!Scope)
+ llvm_unreachable("we shouldn't be here for null scope");
+
+ const DILocation *InlinedAt = Current->getInlinedAt();
+ StringRef Fn = Scope->getFilename();
+ const unsigned Line = Current->getLine();
+ const unsigned Col = Current->getColumn();
+ unsigned Discriminator = 0;
+ if (Line != 0 && getDwarfVersion() >= 4)
+ if (const DILexicalBlockFile *LBF = dyn_cast<DILexicalBlockFile>(Scope))
+ Discriminator = LBF->getDiscriminator();
+
+ const unsigned FileNo = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+ .getOrCreateSourceID(Scope->getFile());
+ // Remove this location from the work list if it is in the inlined_at
+ // locations set.
+ if (EnhancedLineinfo && InlinedAtLocs.contains(Current))
+ InlinedAtLocs.erase(Current);
+
+ if (EnhancedLineinfo && InlinedAt) {
+ const unsigned FileIA = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
+ .getOrCreateSourceID(InlinedAt->getFile());
+ const DISubprogram *SubProgram = getDISubprogram(Current->getScope());
+ DwarfStringPoolEntryRef Entry = InfoHolder.getStringPool().getEntry(
+ *Asm, SubProgram->getLinkageName());
+ Asm->OutStreamer->emitDwarfLocDirectiveWithInlinedAt(
+ FileNo, Line, Col, FileIA, InlinedAt->getLine(),
+ InlinedAt->getColumn(), Entry.getSymbol(), Flags, 0, Discriminator,
+ Fn);
+ } else {
+ Asm->OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
+ Discriminator, Fn);
+ }
+ }
+}
+
+// NVPTX-specific function initialization hook.
+void NVPTXDwarfDebug::beginFunctionHook(const MachineFunction &MF) {
+ InlinedAtLocs.clear();
+ collectInlinedAtLocations(MF);
+}
+
+// NVPTX-specific source line recording with inlined_at support.
+void NVPTXDwarfDebug::recordSourceLineHook(const MachineInstr &MI,
+ const DebugLoc &DL, unsigned Flags) {
+ // Call NVPTX-specific implementation that handles inlined_at.
+ recordSourceLineAndInlinedAt(MI, Flags);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
new file mode 100644
index 0000000000000..d7032725134e6
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -0,0 +1,52 @@
+//===-- NVPTXDwarfDebug.h - NVPTX DwarfDebug Implementation ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares helper classes and functions for NVPTX-specific debug
+// information processing, particularly for inlined function call sites and
+// enhanced line information with inlined_at directives.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
+
+#include "../../CodeGen/AsmPrinter/DwarfCompileUnit.h"
+#include "llvm/ADT/DenseSet.h"
+
+namespace llvm {
+
+/// NVPTXDwarfDebug - NVPTX-specific DwarfDebug implementation.
+/// Inherits from DwarfDebug to provide enhanced line information with
+/// inlined_at support.
+class NVPTXDwarfDebug : public DwarfDebug {
+private:
+ /// Set of InlinedAt locations, used to track if these have been emitted.
+ DenseSet<const DILocation *> InlinedAtLocs;
+
+public:
+ /// Constructor - Pass through to DwarfDebug constructor.
+ NVPTXDwarfDebug(AsmPrinter *A);
+
+ /// Collect all inlined_at locations for the current function.
+ void collectInlinedAtLocations(const MachineFunction &MF);
+
+protected:
+ /// Override hook to collect inlined_at locations.
+ void beginFunctionHook(const MachineFunction &MF) override;
+ /// Override hook to record source line information with inlined_at support.
+ void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
+ unsigned Flags) override;
+
+private:
+ /// NVPTX-specific source line recording with inlined_at support.
+ void recordSourceLineAndInlinedAt(const MachineInstr &MI, unsigned Flags);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_NVPTX_NVPTXDWARFDEBUG_H
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
new file mode 100644
index 0000000000000..4ce97900be584
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_1.ll
@@ -0,0 +1,144 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test mutual recursion with deep inlining - verifies that inlined_at information
+;; is correctly emitted for multiple levels of inlining when foo() and bar() call each other.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+; if (gg > 7)
+; bar();
+; calculate();
+; }
+;
+; __device__ void bar() {
+; if (gg > 17)
+; foo();
+; calculate();
+; }
+;
+; __global__ void kernel() {
+; foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 21
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 16
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+define internal fastcc void @_Z3foov() unnamed_addr !dbg !4 {
+entry:
+ %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+ %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+ br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then: ; preds = %entry
+ tail call fastcc void @_Z3barv(), !dbg !8
+ br label %if.end, !dbg !8
+
+if.end: ; preds = %if.then, %entry
+ tail call void @_Z9calculatev(), !dbg !10
+ ret void, !dbg !11
+}
+
+define internal fastcc void @_Z3barv() unnamed_addr !dbg !12 {
+entry:
+ %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !13
+ %cmp = icmp sgt i32 %tmp, 17, !dbg !13
+ br i1 %cmp, label %if.then, label %if.end, !dbg !13
+
+if.then: ; preds = %entry
+ tail call fastcc void @_Z3foov(), !dbg !15
+ br label %if.end, !dbg !15
+
+if.end: ; preds = %if.then, %entry
+ tail call void @_Z9calculatev(), !dbg !17
+ ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr
+
+; Function Attrs: alwaysinline
+define void @_Z6kernelv() #1 !dbg !19 {
+entry:
+ %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !20
+ %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !20
+ br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !20
+
+if.then.i: ; preds = %entry
+ %cmp.i2 = icmp sgt i32 %tmp.i, 17, !dbg !23
+ br i1 %cmp.i2, label %if.then.i10, label %_Z3barv.exit, !dbg !23
+
+if.then.i10: ; preds = %if.then.i
+ tail call fastcc void @_Z3foov(), !dbg !25
+ tail call void @_Z9calculatev(), !dbg !28
+ tail call void @_Z9calculatev(), !dbg !29
+ br label %_Z3barv.exit, !dbg !30
+
+_Z3barv.exit: ; preds = %if.then.i, %if.then.i10
+ tail call void @_Z9calculatev(), !dbg !31
+ br label %_Z3foov.exit, !dbg !32
+
+_Z3foov.exit: ; preds = %entry, %_Z3barv.exit
+ tail call void @_Z9calculatev(), !dbg !33
+ ret void, !dbg !34
+}
+
+attributes #1 = { alwaysinline }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t1.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 10, column: 5, scope: !9)
+!9 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!10 = !DILocation(line: 11, column: 3, scope: !7)
+!11 = !DILocation(line: 12, column: 1, scope: !7)
+!12 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!13 = !DILocation(line: 15, column: 3, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !12, file: !1, line: 14, column: 29)
+!15 = !DILocation(line: 16, column: 5, scope: !16)
+!16 = distinct !DILexicalBlock(scope: !14, file: !1, line: 15, column: 3)
+!17 = !DILocation(line: 17, column: 3, scope: !14)
+!18 = !DILocation(line: 18, column: 1, scope: !14)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!20 = !DILocation(line: 9, column: 3, scope: !7, inlinedAt: !21)
+!21 = distinct !DILocation(line: 21, column: 3, scope: !22)
+!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!23 = !DILocation(line: 15, column: 3, scope: !14, inlinedAt: !24)
+!24 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!25 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !26)
+!26 = distinct !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !27)
+!27 = distinct !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!28 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !26)
+!29 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !27)
+!30 = !DILocation(line: 16, column: 5, scope: !16, inlinedAt: !24)
+!31 = !DILocation(line: 17, column: 3, scope: !14, inlinedAt: !24)
+!32 = !DILocation(line: 10, column: 5, scope: !9, inlinedAt: !21)
+!33 = !DILocation(line: 11, column: 3, scope: !7, inlinedAt: !21)
+!34 = !DILocation(line: 22, column: 1, scope: !22)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
new file mode 100644
index 0000000000000..47e4655a3ef54
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_2.ll
@@ -0,0 +1,85 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test simple two-level inlining - verifies that inlined_at information is correctly
+;; emitted when foo() calls bar() and kernel() calls foo().
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+;
+; __device__ void foo() {
+; if (gg > 7)
+; bar();
+; }
+;
+; __device__ void bar() {
+; ++gg;
+; }
+;
+; __global__ void kernel() {
+; foo();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 18
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3foov
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barv
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelv() #0 !dbg !5 {
+entry:
+ %tmp.i = load i32, ptr addrspace(1) @gg, align 4, !dbg !7
+ %cmp.i = icmp sgt i32 %tmp.i, 7, !dbg !7
+ br i1 %cmp.i, label %if.then.i, label %_Z3foov.exit, !dbg !7
+
+if.then.i: ; preds = %entry
+ %inc.i.i = add nuw nsw i32 %tmp.i, 1, !dbg !12
+ store i32 %inc.i.i, ptr addrspace(1) @gg, align 4, !dbg !12
+ br label %_Z3foov.exit, !dbg !17
+
+_Z3foov.exit: ; preds = %entry, %if.then.i
+ ret void, !dbg !18
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t2.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelv, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 9, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 8, column: 29)
+!9 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 18, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 17, column: 29)
+!12 = !DILocation(line: 14, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 13, column: 29)
+!14 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 13, type: !6, scopeLine: 13, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!16 = distinct !DILexicalBlock(scope: !8, file: !1, line: 9, column: 3)
+!17 = !DILocation(line: 10, column: 5, scope: !16, inlinedAt: !10)
+!18 = !DILocation(line: 19, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
new file mode 100644
index 0000000000000..5b94a249ab64d
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_3.ll
@@ -0,0 +1,305 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test inlining of a C++ constructor with control flow - verifies that inlined_at
+;; information is correctly emitted when a constructor containing loops and conditionals is inlined.
+;
+; __device__ int gg;
+; __device__ int *arr;
+;
+; class C {
+; int priv;
+; public: __device__ C();
+; __device__ C(int);
+; __device__ int get() const;
+; };
+;
+;
+; __device__ C::C() : priv(1) {
+; int sum = 0;
+; for (int i = 0; i < gg; ++i) sum += arr[i];
+; if (sum > 17)
+; priv = sum;
+; }
+;
+; __device__ C::C(int n) : priv(n) {}
+;
+; __device__ int C::get() const { return priv; }
+;
+; __global__ void kernel(int n) {
+; C c1;
+; if (n > 7)
+; gg = c1.get();
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 24
+; CHECK: .loc [[FILENUM]] 14 {{[0-9]*}}, function_name [[CTORNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[CTORNAME]]:
+; CHECK-NEXT: // {{.*}} _ZN1CC1Ev
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at arr = internal addrspace(1) global ptr null, align 8
+ at llvm.used = appending global [3 x ptr] [ptr @_Z6kerneli, ptr addrspacecast (ptr addrspace(1) @arr to ptr), ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none)
+define void @_Z6kerneli(i32 noundef %n) #0 !dbg !4 {
+entry:
+ %tmp3.i1 = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+ %cmp.i2 = icmp sgt i32 %tmp3.i1, 0, !dbg !6
+ br i1 %cmp.i2, label %for.body.i.preheader, label %for.end.i, !dbg !6
+
+for.body.i.preheader: ; preds = %entry
+ %tmp4.i = load ptr, ptr addrspace(1) @arr, align 8
+ %0 = addrspacecast ptr %tmp4.i to ptr addrspace(1)
+ %xtraiter = and i32 %tmp3.i1, 15, !dbg !6
+ %1 = icmp samesign ult i32 %tmp3.i1, 16, !dbg !6
+ br i1 %1, label %for.end.i.loopexit.unr-lcssa, label %for.body.i.preheader.new, !dbg !6
+
+for.body.i.preheader.new: ; preds = %for.body.i.preheader
+ %unroll_iter = and i32 %tmp3.i1, 2147483632, !dbg !6
+ br label %for.body.i, !dbg !6
+
+for.body.i: ; preds = %for.body.i, %for.body.i.preheader.new
+ %i.0.i4 = phi i32 [ 0, %for.body.i.preheader.new ], [ %inc.i.15, %for.body.i ]
+ %sum.0.i3 = phi i32 [ 0, %for.body.i.preheader.new ], [ %add.i.15, %for.body.i ]
+ %niter = phi i32 [ 0, %for.body.i.preheader.new ], [ %niter.next.15, %for.body.i ]
+ %2 = zext nneg i32 %i.0.i4 to i64, !dbg !6
+ %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %2, !dbg !6
+ %tmp6.i = load i32, ptr addrspace(1) %getElem, align 4, !dbg !6
+ %add.i = add nsw i32 %tmp6.i, %sum.0.i3, !dbg !6
+ %inc.i = add nuw nsw i32 %i.0.i4, 1, !dbg !6
+ %3 = zext nneg i32 %inc.i to i64, !dbg !6
+ %getElem.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %3, !dbg !6
+ %tmp6.i.1 = load i32, ptr addrspace(1) %getElem.1, align 4, !dbg !6
+ %add.i.1 = add nsw i32 %tmp6.i.1, %add.i, !dbg !6
+ %inc.i.1 = add nuw nsw i32 %i.0.i4, 2, !dbg !6
+ %4 = zext nneg i32 %inc.i.1 to i64, !dbg !6
+ %getElem.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %4, !dbg !6
+ %tmp6.i.2 = load i32, ptr addrspace(1) %getElem.2, align 4, !dbg !6
+ %add.i.2 = add nsw i32 %tmp6.i.2, %add.i.1, !dbg !6
+ %inc.i.2 = add nuw nsw i32 %i.0.i4, 3, !dbg !6
+ %5 = zext nneg i32 %inc.i.2 to i64, !dbg !6
+ %getElem.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %5, !dbg !6
+ %tmp6.i.3 = load i32, ptr addrspace(1) %getElem.3, align 4, !dbg !6
+ %add.i.3 = add nsw i32 %tmp6.i.3, %add.i.2, !dbg !6
+ %inc.i.3 = add nuw nsw i32 %i.0.i4, 4, !dbg !6
+ %6 = zext nneg i32 %inc.i.3 to i64, !dbg !6
+ %getElem.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %6, !dbg !6
+ %tmp6.i.4 = load i32, ptr addrspace(1) %getElem.4, align 4, !dbg !6
+ %add.i.4 = add nsw i32 %tmp6.i.4, %add.i.3, !dbg !6
+ %inc.i.4 = add nuw nsw i32 %i.0.i4, 5, !dbg !6
+ %7 = zext nneg i32 %inc.i.4 to i64, !dbg !6
+ %getElem.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %7, !dbg !6
+ %tmp6.i.5 = load i32, ptr addrspace(1) %getElem.5, align 4, !dbg !6
+ %add.i.5 = add nsw i32 %tmp6.i.5, %add.i.4, !dbg !6
+ %inc.i.5 = add nuw nsw i32 %i.0.i4, 6, !dbg !6
+ %8 = zext nneg i32 %inc.i.5 to i64, !dbg !6
+ %getElem.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %8, !dbg !6
+ %tmp6.i.6 = load i32, ptr addrspace(1) %getElem.6, align 4, !dbg !6
+ %add.i.6 = add nsw i32 %tmp6.i.6, %add.i.5, !dbg !6
+ %inc.i.6 = add nuw nsw i32 %i.0.i4, 7, !dbg !6
+ %9 = zext nneg i32 %inc.i.6 to i64, !dbg !6
+ %getElem.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %9, !dbg !6
+ %tmp6.i.7 = load i32, ptr addrspace(1) %getElem.7, align 4, !dbg !6
+ %add.i.7 = add nsw i32 %tmp6.i.7, %add.i.6, !dbg !6
+ %inc.i.7 = add nuw nsw i32 %i.0.i4, 8, !dbg !6
+ %10 = zext nneg i32 %inc.i.7 to i64, !dbg !6
+ %getElem.8 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %10, !dbg !6
+ %tmp6.i.8 = load i32, ptr addrspace(1) %getElem.8, align 4, !dbg !6
+ %add.i.8 = add nsw i32 %tmp6.i.8, %add.i.7, !dbg !6
+ %inc.i.8 = add nuw nsw i32 %i.0.i4, 9, !dbg !6
+ %11 = zext nneg i32 %inc.i.8 to i64, !dbg !6
+ %getElem.9 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %11, !dbg !6
+ %tmp6.i.9 = load i32, ptr addrspace(1) %getElem.9, align 4, !dbg !6
+ %add.i.9 = add nsw i32 %tmp6.i.9, %add.i.8, !dbg !6
+ %inc.i.9 = add nuw nsw i32 %i.0.i4, 10, !dbg !6
+ %12 = zext nneg i32 %inc.i.9 to i64, !dbg !6
+ %getElem.10 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %12, !dbg !6
+ %tmp6.i.10 = load i32, ptr addrspace(1) %getElem.10, align 4, !dbg !6
+ %add.i.10 = add nsw i32 %tmp6.i.10, %add.i.9, !dbg !6
+ %inc.i.10 = add nuw nsw i32 %i.0.i4, 11, !dbg !6
+ %13 = zext nneg i32 %inc.i.10 to i64, !dbg !6
+ %getElem.11 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %13, !dbg !6
+ %tmp6.i.11 = load i32, ptr addrspace(1) %getElem.11, align 4, !dbg !6
+ %add.i.11 = add nsw i32 %tmp6.i.11, %add.i.10, !dbg !6
+ %inc.i.11 = add nuw nsw i32 %i.0.i4, 12, !dbg !6
+ %14 = zext nneg i32 %inc.i.11 to i64, !dbg !6
+ %getElem.12 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %14, !dbg !6
+ %tmp6.i.12 = load i32, ptr addrspace(1) %getElem.12, align 4, !dbg !6
+ %add.i.12 = add nsw i32 %tmp6.i.12, %add.i.11, !dbg !6
+ %inc.i.12 = add nuw nsw i32 %i.0.i4, 13, !dbg !6
+ %15 = zext nneg i32 %inc.i.12 to i64, !dbg !6
+ %getElem.13 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %15, !dbg !6
+ %tmp6.i.13 = load i32, ptr addrspace(1) %getElem.13, align 4, !dbg !6
+ %add.i.13 = add nsw i32 %tmp6.i.13, %add.i.12, !dbg !6
+ %inc.i.13 = add nuw nsw i32 %i.0.i4, 14, !dbg !6
+ %16 = zext nneg i32 %inc.i.13 to i64, !dbg !6
+ %getElem.14 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %16, !dbg !6
+ %tmp6.i.14 = load i32, ptr addrspace(1) %getElem.14, align 4, !dbg !6
+ %add.i.14 = add nsw i32 %tmp6.i.14, %add.i.13, !dbg !6
+ %inc.i.14 = add nuw nsw i32 %i.0.i4, 15, !dbg !6
+ %17 = zext nneg i32 %inc.i.14 to i64, !dbg !6
+ %getElem.15 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %17, !dbg !6
+ %tmp6.i.15 = load i32, ptr addrspace(1) %getElem.15, align 4, !dbg !6
+ %add.i.15 = add nsw i32 %tmp6.i.15, %add.i.14, !dbg !6
+ %inc.i.15 = add nuw nsw i32 %i.0.i4, 16, !dbg !6
+ %niter.next.15 = add i32 %niter, 16, !dbg !6
+ %niter.ncmp.15.not = icmp eq i32 %niter.next.15, %unroll_iter, !dbg !6
+ br i1 %niter.ncmp.15.not, label %for.end.i.loopexit.unr-lcssa, label %for.body.i, !dbg !6, !llvm.loop !11
+
+for.end.i.loopexit.unr-lcssa: ; preds = %for.body.i, %for.body.i.preheader
+ %add.i.lcssa.ph = phi i32 [ poison, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+ %i.0.i4.unr = phi i32 [ 0, %for.body.i.preheader ], [ %inc.i.15, %for.body.i ]
+ %sum.0.i3.unr = phi i32 [ 0, %for.body.i.preheader ], [ %add.i.15, %for.body.i ]
+ %lcmp.mod.not = icmp eq i32 %xtraiter, 0, !dbg !6
+ br i1 %lcmp.mod.not, label %for.end.i, label %for.body.i.epil.preheader, !dbg !6
+
+for.body.i.epil.preheader: ; preds = %for.end.i.loopexit.unr-lcssa
+ %xtraiter6 = and i32 %tmp3.i1, 7, !dbg !6
+ %18 = icmp samesign ult i32 %xtraiter, 8, !dbg !6
+ br i1 %18, label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, label %for.body.i.epil, !dbg !6
+
+for.body.i.epil: ; preds = %for.body.i.epil.preheader
+ %19 = zext nneg i32 %i.0.i4.unr to i64, !dbg !6
+ %getElem.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %19, !dbg !6
+ %tmp6.i.epil = load i32, ptr addrspace(1) %getElem.epil, align 4, !dbg !6
+ %add.i.epil = add nsw i32 %tmp6.i.epil, %sum.0.i3.unr, !dbg !6
+ %inc.i.epil = add nuw nsw i32 %i.0.i4.unr, 1, !dbg !6
+ %20 = zext nneg i32 %inc.i.epil to i64, !dbg !6
+ %getElem.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %20, !dbg !6
+ %tmp6.i.epil.1 = load i32, ptr addrspace(1) %getElem.epil.1, align 4, !dbg !6
+ %add.i.epil.1 = add nsw i32 %tmp6.i.epil.1, %add.i.epil, !dbg !6
+ %inc.i.epil.1 = add nuw nsw i32 %i.0.i4.unr, 2, !dbg !6
+ %21 = zext nneg i32 %inc.i.epil.1 to i64, !dbg !6
+ %getElem.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %21, !dbg !6
+ %tmp6.i.epil.2 = load i32, ptr addrspace(1) %getElem.epil.2, align 4, !dbg !6
+ %add.i.epil.2 = add nsw i32 %tmp6.i.epil.2, %add.i.epil.1, !dbg !6
+ %inc.i.epil.2 = add nuw nsw i32 %i.0.i4.unr, 3, !dbg !6
+ %22 = zext nneg i32 %inc.i.epil.2 to i64, !dbg !6
+ %getElem.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %22, !dbg !6
+ %tmp6.i.epil.3 = load i32, ptr addrspace(1) %getElem.epil.3, align 4, !dbg !6
+ %add.i.epil.3 = add nsw i32 %tmp6.i.epil.3, %add.i.epil.2, !dbg !6
+ %inc.i.epil.3 = add nuw nsw i32 %i.0.i4.unr, 4, !dbg !6
+ %23 = zext nneg i32 %inc.i.epil.3 to i64, !dbg !6
+ %getElem.epil.4 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %23, !dbg !6
+ %tmp6.i.epil.4 = load i32, ptr addrspace(1) %getElem.epil.4, align 4, !dbg !6
+ %add.i.epil.4 = add nsw i32 %tmp6.i.epil.4, %add.i.epil.3, !dbg !6
+ %inc.i.epil.4 = add nuw nsw i32 %i.0.i4.unr, 5, !dbg !6
+ %24 = zext nneg i32 %inc.i.epil.4 to i64, !dbg !6
+ %getElem.epil.5 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %24, !dbg !6
+ %tmp6.i.epil.5 = load i32, ptr addrspace(1) %getElem.epil.5, align 4, !dbg !6
+ %add.i.epil.5 = add nsw i32 %tmp6.i.epil.5, %add.i.epil.4, !dbg !6
+ %inc.i.epil.5 = add nuw nsw i32 %i.0.i4.unr, 6, !dbg !6
+ %25 = zext nneg i32 %inc.i.epil.5 to i64, !dbg !6
+ %getElem.epil.6 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %25, !dbg !6
+ %tmp6.i.epil.6 = load i32, ptr addrspace(1) %getElem.epil.6, align 4, !dbg !6
+ %add.i.epil.6 = add nsw i32 %tmp6.i.epil.6, %add.i.epil.5, !dbg !6
+ %inc.i.epil.6 = add nuw nsw i32 %i.0.i4.unr, 7, !dbg !6
+ %26 = zext nneg i32 %inc.i.epil.6 to i64, !dbg !6
+ %getElem.epil.7 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %26, !dbg !6
+ %tmp6.i.epil.7 = load i32, ptr addrspace(1) %getElem.epil.7, align 4, !dbg !6
+ %add.i.epil.7 = add nsw i32 %tmp6.i.epil.7, %add.i.epil.6, !dbg !6
+ %inc.i.epil.7 = add nuw nsw i32 %i.0.i4.unr, 8, !dbg !6
+ br label %for.end.i.loopexit.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.unr-lcssa: ; preds = %for.body.i.epil, %for.body.i.epil.preheader
+ %add.i.lcssa.ph5.ph = phi i32 [ poison, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+ %i.0.i4.epil.unr = phi i32 [ %i.0.i4.unr, %for.body.i.epil.preheader ], [ %inc.i.epil.7, %for.body.i.epil ]
+ %sum.0.i3.epil.unr = phi i32 [ %sum.0.i3.unr, %for.body.i.epil.preheader ], [ %add.i.epil.7, %for.body.i.epil ]
+ %lcmp.mod8.not = icmp eq i32 %xtraiter6, 0, !dbg !6
+ br i1 %lcmp.mod8.not, label %for.end.i, label %for.body.i.epil.epil.preheader, !dbg !6
+
+for.body.i.epil.epil.preheader: ; preds = %for.end.i.loopexit.epilog-lcssa.unr-lcssa
+ %xtraiter12 = and i32 %tmp3.i1, 3, !dbg !6
+ %27 = icmp samesign ult i32 %xtraiter6, 4, !dbg !6
+ br i1 %27, label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, label %for.body.i.epil.epil, !dbg !6
+
+for.body.i.epil.epil: ; preds = %for.body.i.epil.epil.preheader
+ %28 = zext nneg i32 %i.0.i4.epil.unr to i64, !dbg !6
+ %getElem.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %28, !dbg !6
+ %tmp6.i.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil, align 4, !dbg !6
+ %add.i.epil.epil = add nsw i32 %tmp6.i.epil.epil, %sum.0.i3.epil.unr, !dbg !6
+ %inc.i.epil.epil = add nuw nsw i32 %i.0.i4.epil.unr, 1, !dbg !6
+ %29 = zext nneg i32 %inc.i.epil.epil to i64, !dbg !6
+ %getElem.epil.epil.1 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %29, !dbg !6
+ %tmp6.i.epil.epil.1 = load i32, ptr addrspace(1) %getElem.epil.epil.1, align 4, !dbg !6
+ %add.i.epil.epil.1 = add nsw i32 %tmp6.i.epil.epil.1, %add.i.epil.epil, !dbg !6
+ %inc.i.epil.epil.1 = add nuw nsw i32 %i.0.i4.epil.unr, 2, !dbg !6
+ %30 = zext nneg i32 %inc.i.epil.epil.1 to i64, !dbg !6
+ %getElem.epil.epil.2 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %30, !dbg !6
+ %tmp6.i.epil.epil.2 = load i32, ptr addrspace(1) %getElem.epil.epil.2, align 4, !dbg !6
+ %add.i.epil.epil.2 = add nsw i32 %tmp6.i.epil.epil.2, %add.i.epil.epil.1, !dbg !6
+ %inc.i.epil.epil.2 = add nuw nsw i32 %i.0.i4.epil.unr, 3, !dbg !6
+ %31 = zext nneg i32 %inc.i.epil.epil.2 to i64, !dbg !6
+ %getElem.epil.epil.3 = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %31, !dbg !6
+ %tmp6.i.epil.epil.3 = load i32, ptr addrspace(1) %getElem.epil.epil.3, align 4, !dbg !6
+ %add.i.epil.epil.3 = add nsw i32 %tmp6.i.epil.epil.3, %add.i.epil.epil.2, !dbg !6
+ %inc.i.epil.epil.3 = add nuw nsw i32 %i.0.i4.epil.unr, 4, !dbg !6
+ br label %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, !dbg !6
+
+for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa: ; preds = %for.body.i.epil.epil, %for.body.i.epil.epil.preheader
+ %add.i.lcssa.ph5.ph9.ph = phi i32 [ poison, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+ %i.0.i4.epil.epil.unr = phi i32 [ %i.0.i4.epil.unr, %for.body.i.epil.epil.preheader ], [ %inc.i.epil.epil.3, %for.body.i.epil.epil ]
+ %sum.0.i3.epil.epil.unr = phi i32 [ %sum.0.i3.epil.unr, %for.body.i.epil.epil.preheader ], [ %add.i.epil.epil.3, %for.body.i.epil.epil ]
+ %lcmp.mod14.not = icmp eq i32 %xtraiter12, 0, !dbg !6
+ br i1 %lcmp.mod14.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6
+
+for.body.i.epil.epil.epil: ; preds = %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.body.i.epil.epil.epil
+ %i.0.i4.epil.epil.epil = phi i32 [ %inc.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %i.0.i4.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+ %sum.0.i3.epil.epil.epil = phi i32 [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], [ %sum.0.i3.epil.epil.unr, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+ %epil.iter13 = phi i32 [ %epil.iter13.next, %for.body.i.epil.epil.epil ], [ 0, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ]
+ %32 = zext nneg i32 %i.0.i4.epil.epil.epil to i64, !dbg !6
+ %getElem.epil.epil.epil = getelementptr inbounds nuw i32, ptr addrspace(1) %0, i64 %32, !dbg !6
+ %tmp6.i.epil.epil.epil = load i32, ptr addrspace(1) %getElem.epil.epil.epil, align 4, !dbg !6
+ %add.i.epil.epil.epil = add nsw i32 %tmp6.i.epil.epil.epil, %sum.0.i3.epil.epil.epil, !dbg !6
+ %inc.i.epil.epil.epil = add nuw nsw i32 %i.0.i4.epil.epil.epil, 1, !dbg !6
+ %epil.iter13.next = add i32 %epil.iter13, 1, !dbg !6
+ %epil.iter13.cmp.not = icmp eq i32 %epil.iter13.next, %xtraiter12, !dbg !6
+ br i1 %epil.iter13.cmp.not, label %for.end.i, label %for.body.i.epil.epil.epil, !dbg !6, !llvm.loop !14
+
+for.end.i: ; preds = %for.body.i.epil.epil.epil, %for.end.i.loopexit.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa, %for.end.i.loopexit.epilog-lcssa.unr-lcssa, %entry
+ %sum.0.i.lcssa = phi i32 [ 0, %entry ], [ %add.i.lcssa.ph, %for.end.i.loopexit.unr-lcssa ], [ %add.i.lcssa.ph5.ph, %for.end.i.loopexit.epilog-lcssa.unr-lcssa ], [ %add.i.lcssa.ph5.ph9.ph, %for.end.i.loopexit.epilog-lcssa.epilog-lcssa.unr-lcssa ], [ %add.i.epil.epil.epil, %for.body.i.epil.epil.epil ], !dbg !6
+ %cmp10.i = icmp sgt i32 %sum.0.i.lcssa, 17, !dbg !15
+ %spec.select = select i1 %cmp10.i, i32 %sum.0.i.lcssa, i32 1, !dbg !15
+ %cmp = icmp sgt i32 %n, 7, !dbg !16
+ br i1 %cmp, label %if.then, label %if.end, !dbg !16
+
+if.then: ; preds = %for.end.i
+ store i32 %spec.select, ptr addrspace(1) @gg, align 4, !dbg !17
+ br label %if.end, !dbg !17
+
+if.end: ; preds = %if.then, %for.end.i
+ ret void, !dbg !19
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(readwrite, argmem: read, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t3.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kerneli", scope: !1, file: !1, line: 23, type: !5, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 14, column: 3, scope: !7, inlinedAt: !9)
+!7 = distinct !DILexicalBlock(scope: !8, file: !1, line: 12, column: 27)
+!8 = distinct !DISubprogram(name: "C", linkageName: "_ZN1CC1Ev", scope: !1, file: !1, line: 12, type: !5, scopeLine: 12, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!9 = distinct !DILocation(line: 24, column: 3, scope: !10)
+!10 = distinct !DILexicalBlock(scope: !4, file: !1, line: 23, column: 29)
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.mustprogress"}
+!13 = !{!"llvm.loop.unroll.disable"}
+!14 = distinct !{!14, !13}
+!15 = !DILocation(line: 15, column: 3, scope: !7, inlinedAt: !9)
+!16 = !DILocation(line: 25, column: 3, scope: !10)
+!17 = !DILocation(line: 26, column: 5, scope: !18)
+!18 = distinct !DILexicalBlock(scope: !10, file: !1, line: 25, column: 3)
+!19 = !DILocation(line: 27, column: 1, scope: !10)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
new file mode 100644
index 0000000000000..57df39e8034c4
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
@@ -0,0 +1,229 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test multiple inline calls at the same level - verifies that inlined_at information
+;; is correctly emitted when multiple different functions (or multiple copies of the same function) are inlined into a single caller.
+;
+; __device__ __forceinline__ int foo(int a)
+; {
+; if (a > 7)
+; return a*a;
+; return ++a;
+; }
+;
+; __device__ __forceinline__ int baz(int a)
+; {
+; if (a > 23)
+; return a*2;
+; return ++a;
+; }
+;
+; __device__ int bar(int i, int j)
+; {
+; return i + j;
+; }
+;
+; __device__ int d;
+;
+; // inlining two different functions
+; __global__ void kernel1(int x, int y)
+; {
+; d = bar(foo(x), baz(y));
+; }
+;
+; // inlining two copies of same function
+; __global__ void kernel2(int x, int y)
+; {
+; d = bar(foo(x), foo(y));
+; }
+;
+; // inlining two different functions, extra computation in caller (y+x)
+; __global__ void kernel3(int x, int y)
+; {
+; d = bar(foo(x), baz(y + x));
+; }
+;
+; // inlining two copies of same function, extra computation in caller (y+x)
+; __global__ void kernel4(int x, int y)
+; {
+; d = bar(foo(x), foo(y + x));
+; }
+;
+; CHECK: .entry _Z7kernel1ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+; CHECK: .loc [[FILENUM:[1-9]]] 25
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 25
+;
+; CHECK: .entry _Z7kernel2ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 31
+; CHECK: .loc [[FILENUM:[1-9]]] 31
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 31
+;
+; CHECK: .entry _Z7kernel3ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 10 {{[0-9]*}}, function_name [[BAZNAME]], inlined_at [[FILENUM]] 37
+; CHECK: .loc [[FILENUM:[1-9]]] 37
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 37
+;
+; CHECK: .entry _Z7kernel4ii(
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOONAME]], inlined_at [[FILENUM]] 43
+; CHECK: .loc [[FILENUM:[1-9]]] 43
+; CHECK: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 43
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOONAME]]:
+; CHECK-NEXT: // {{.*}} _Z3fooi
+; CHECK: [[BAZNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3bazi
+; CHECK: [[BARNAME]]:
+; CHECK-NEXT: // {{.*}} _Z3barii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at d = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [5 x ptr] [ptr @_Z7kernel1ii, ptr @_Z7kernel2ii, ptr @_Z7kernel3ii, ptr @_Z7kernel4ii, ptr addrspacecast (ptr addrspace(1) @d to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel1ii(i32 noundef %x, i32 noundef %y) #0 !dbg !11 {
+entry:
+ %cmp.i = icmp sgt i32 %x, 7, !dbg !13
+ %mul.i = mul nsw i32 %x, %x, !dbg !13
+ %inc.i = add nsw i32 %x, 1, !dbg !13
+ %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !13
+ %cmp.i1 = icmp sgt i32 %y, 23, !dbg !18
+ %mul.i2 = shl nuw nsw i32 %y, 1, !dbg !18
+ %inc.i3 = add nsw i32 %y, 1, !dbg !18
+ %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !18
+ %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !22
+ store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !26
+ ret void, !dbg !27
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel2ii(i32 noundef %x, i32 noundef %y) #0 !dbg !28 {
+entry:
+ %cmp.i = icmp sgt i32 %x, 7, !dbg !29
+ %mul.i = mul nsw i32 %x, %x, !dbg !29
+ %inc.i = add nsw i32 %x, 1, !dbg !29
+ %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !29
+ %cmp.i1 = icmp sgt i32 %y, 7, !dbg !32
+ %mul.i2 = mul nsw i32 %y, %y, !dbg !32
+ %inc.i3 = add nsw i32 %y, 1, !dbg !32
+ %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !32
+ %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !34
+ store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !36
+ ret void, !dbg !37
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel3ii(i32 noundef %x, i32 noundef %y) #0 !dbg !38 {
+entry:
+ %cmp.i = icmp sgt i32 %x, 7, !dbg !39
+ %mul.i = mul nsw i32 %x, %x, !dbg !39
+ %inc.i = add nsw i32 %x, 1, !dbg !39
+ %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !39
+ %add = add nsw i32 %y, %x, !dbg !42
+ %cmp.i1 = icmp sgt i32 %add, 23, !dbg !43
+ %mul.i2 = shl nuw nsw i32 %add, 1, !dbg !43
+ %inc.i3 = add nsw i32 %add, 1, !dbg !43
+ %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !43
+ %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !45
+ store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !42
+ ret void, !dbg !47
+}
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z7kernel4ii(i32 noundef %x, i32 noundef %y) #0 !dbg !48 {
+entry:
+ %cmp.i = icmp sgt i32 %x, 7, !dbg !49
+ %mul.i = mul nsw i32 %x, %x, !dbg !49
+ %inc.i = add nsw i32 %x, 1, !dbg !49
+ %retval.0.i = select i1 %cmp.i, i32 %mul.i, i32 %inc.i, !dbg !49
+ %add = add nsw i32 %y, %x, !dbg !52
+ %cmp.i1 = icmp sgt i32 %add, 7, !dbg !53
+ %mul.i2 = mul nsw i32 %add, %add, !dbg !53
+ %inc.i3 = add nsw i32 %add, 1, !dbg !53
+ %retval.0.i4 = select i1 %cmp.i1, i32 %mul.i2, i32 %inc.i3, !dbg !53
+ %add.i = add nsw i32 %retval.0.i4, %retval.0.i, !dbg !55
+ store i32 %add.i, ptr addrspace(1) @d, align 4, !dbg !52
+ ret void, !dbg !57
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvmir.version = !{!2, !3, !4}
+!nvvm.annotations = !{!5, !6, !7, !8}
+!llvm.module.flags = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t4.cu", directory: "")
+!2 = !{i32 2, i32 0, i32 3, i32 2}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, i32 0, i32 3, i32 1}
+!5 = !{ptr @_Z7kernel1ii, !"kernel", i32 1}
+!6 = !{ptr @_Z7kernel2ii, !"kernel", i32 1}
+!7 = !{ptr @_Z7kernel3ii, !"kernel", i32 1}
+!8 = !{ptr @_Z7kernel4ii, !"kernel", i32 1}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{}
+!11 = distinct !DISubprogram(name: "kernel1", linkageName: "_Z7kernel1ii", scope: !1, file: !1, line: 23, type: !12, scopeLine: 23, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!12 = !DISubroutineType(types: !10)
+!13 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !16)
+!14 = distinct !DILexicalBlock(scope: !15, file: !1, line: 2, column: 1)
+!15 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!16 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !11, file: !1, line: 24, column: 1)
+!18 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !21)
+!19 = distinct !DILexicalBlock(scope: !20, file: !1, line: 9, column: 1)
+!20 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazi", scope: !1, file: !1, line: 8, type: !12, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!21 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!22 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !25)
+!23 = distinct !DILexicalBlock(scope: !24, file: !1, line: 16, column: 1)
+!24 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barii", scope: !1, file: !1, line: 15, type: !12, scopeLine: 15, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!25 = distinct !DILocation(line: 25, column: 3, scope: !17)
+!26 = !DILocation(line: 25, column: 3, scope: !17)
+!27 = !DILocation(line: 26, column: 1, scope: !17)
+!28 = distinct !DISubprogram(name: "kernel2", linkageName: "_Z7kernel2ii", scope: !1, file: !1, line: 29, type: !12, scopeLine: 29, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!29 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !30)
+!30 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !28, file: !1, line: 30, column: 1)
+!32 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !33)
+!33 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!34 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !35)
+!35 = distinct !DILocation(line: 31, column: 3, scope: !31)
+!36 = !DILocation(line: 31, column: 3, scope: !31)
+!37 = !DILocation(line: 32, column: 1, scope: !31)
+!38 = distinct !DISubprogram(name: "kernel3", linkageName: "_Z7kernel3ii", scope: !1, file: !1, line: 35, type: !12, scopeLine: 35, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!39 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !40)
+!40 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!41 = distinct !DILexicalBlock(scope: !38, file: !1, line: 36, column: 1)
+!42 = !DILocation(line: 37, column: 3, scope: !41)
+!43 = !DILocation(line: 10, column: 3, scope: !19, inlinedAt: !44)
+!44 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!45 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !46)
+!46 = distinct !DILocation(line: 37, column: 3, scope: !41)
+!47 = !DILocation(line: 38, column: 1, scope: !41)
+!48 = distinct !DISubprogram(name: "kernel4", linkageName: "_Z7kernel4ii", scope: !1, file: !1, line: 41, type: !12, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!49 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !50)
+!50 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!51 = distinct !DILexicalBlock(scope: !48, file: !1, line: 42, column: 1)
+!52 = !DILocation(line: 43, column: 3, scope: !51)
+!53 = !DILocation(line: 3, column: 3, scope: !14, inlinedAt: !54)
+!54 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!55 = !DILocation(line: 17, column: 3, scope: !23, inlinedAt: !56)
+!56 = distinct !DILocation(line: 43, column: 3, scope: !51)
+!57 = !DILocation(line: 44, column: 1, scope: !51)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
new file mode 100644
index 0000000000000..dc242a95f54d8
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_5.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s
+;
+;; Test deep inline chain - verifies that inlined_at information is correctly emitted
+;; through an 11-level deep chain of inlining (foo0 through foo10).
+;
+; __device__ int foo0(int a, int b) {
+; if (a > b)
+; return --a;
+; if (a > b - 7)
+; return a*2;
+; return ++a;
+; }
+; __device__ int foo1(int a, int b) {
+; return foo0(a*3, b*b);
+; }
+; __device__ int foo2(int a, int b) {
+; return foo1(a+2, b*b);
+; }
+; __device__ int foo3(int a, int b) {
+; return foo2(a+100, b-7);
+; }
+; __device__ int foo4(int a, int b) {
+; return foo3(a*a, b*3);
+; }
+; __device__ int foo5(int a, int b) {
+; return foo4(a*3, b*3);
+; }
+; __device__ int foo6(int a, int b) {
+; return foo5(a*3, b*3);
+; }
+; __device__ int foo7(int a, int b) {
+; return foo6(a*a + 2, b*b*5);
+; }
+; __device__ int foo8(int a, int b) {
+; return foo7(a*2, b*2*a);
+; }
+; __device__ int foo9(int a, int b) {
+; return foo8(a*2, b*2*a);
+; }
+; __device__ int foo10(int a, int b) {
+; return foo9(a*2*b, b*2);
+; }
+;
+; __device__ int g;
+;
+; __global__ void kernel(int a, int b) {
+; g = foo10(a, b);
+; }
+;
+; CHECK: .entry _Z6kernelii(
+; CHECK: .loc [[FILENUM:[1-9]]] 42
+; CHECK: .loc [[FILENUM]] 36 {{[0-9]*}}, function_name [[FOO10NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 42
+; CHECK: .loc [[FILENUM]] 33 {{[0-9]*}}, function_name [[FOO9NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 36
+; CHECK: .loc [[FILENUM]] 30 {{[0-9]*}}, function_name [[FOO8NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 33
+; CHECK: .loc [[FILENUM]] 27 {{[0-9]*}}, function_name [[FOO7NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 30
+; CHECK: .loc [[FILENUM]] 24 {{[0-9]*}}, function_name [[FOO6NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 27
+; CHECK: .loc [[FILENUM]] 21 {{[0-9]*}}, function_name [[FOO5NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 24
+; CHECK: .loc [[FILENUM]] 18 {{[0-9]*}}, function_name [[FOO4NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 21
+; CHECK: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[FOO3NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 18
+; CHECK: .loc [[FILENUM]] 12 {{[0-9]*}}, function_name [[FOO2NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 15
+; CHECK: .loc [[FILENUM]] 9 {{[0-9]*}}, function_name [[FOO1NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 12
+; CHECK: .loc [[FILENUM]] 2 {{[0-9]*}}, function_name [[FOO0NAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 9
+; CHECK: .loc [[FILENUM]] 3 {{[0-9]*}}, function_name [[FOO0NAME]], inlined_at [[FILENUM]] 9
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[FOO10NAME]]:
+; CHECK-NEXT: // {{.*}} _Z5foo10ii
+; CHECK: [[FOO9NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo9ii
+; CHECK: [[FOO8NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo8ii
+; CHECK: [[FOO7NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo7ii
+; CHECK: [[FOO6NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo6ii
+; CHECK: [[FOO5NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo5ii
+; CHECK: [[FOO4NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo4ii
+; CHECK: [[FOO3NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo3ii
+; CHECK: [[FOO2NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo2ii
+; CHECK: [[FOO1NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo1ii
+; CHECK: [[FOO0NAME]]:
+; CHECK-NEXT: // {{.*}} _Z4foo0ii
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at g = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelii, ptr addrspacecast (ptr addrspace(1) @g to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none)
+define void @_Z6kernelii(i32 noundef %a, i32 noundef %b) #0 !dbg !5 {
+entry:
+ %mul.i = shl nsw i32 %a, 1, !dbg !7
+ %mul2.i = mul nsw i32 %mul.i, %b, !dbg !7
+ %mul.i.i = shl nsw i32 %mul2.i, 1, !dbg !12
+ %mul.i.i.i = shl nsw i32 %mul2.i, 2, !dbg !16
+ %mul4.i.i = shl i32 %b, 3, !dbg !12
+ %mul2.i.i.i = mul i32 %mul4.i.i, %mul2.i, !dbg !16
+ %mul4.i.i.i = mul nsw i32 %mul2.i.i.i, %mul.i.i, !dbg !16
+ %mul.i.i.i.i = mul i32 %mul2.i, 36, !dbg !20
+ %0 = mul i32 %mul.i.i.i.i, %mul.i.i.i, !dbg !24
+ %mul.i.i.i.i.i.i = add nuw nsw i32 %0, 18, !dbg !24
+ %mul4.i.i.i.i = mul i32 %mul4.i.i.i, 135, !dbg !20
+ %mul3.i.i.i.i.i.i.i = mul i32 %mul4.i.i.i.i, %mul4.i.i.i, !dbg !31
+ %sub.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i, -7, !dbg !35
+ %mul.i.i.i.i.i.i.i.i.i = mul nsw i32 %sub.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i, !dbg !39
+ %mul.i.i.i.i.i.i.i = mul i32 %mul.i.i.i.i.i.i, 3, !dbg !31
+ %1 = mul i32 %mul.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i, !dbg !43
+ %mul.i.i.i.i.i.i.i.i.i.i = add i32 %1, 306, !dbg !43
+ %mul3.i.i.i.i.i.i.i.i.i.i = mul nuw nsw i32 %mul.i.i.i.i.i.i.i.i.i, %mul.i.i.i.i.i.i.i.i.i, !dbg !43
+ %cmp.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %mul3.i.i.i.i.i.i.i.i.i.i, !dbg !47
+ br i1 %cmp.i.i.i.i.i.i.i.i.i.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i.i, label %if.end.i.i.i.i.i.i.i.i.i.i.i, !dbg !47
+
+if.then.i.i.i.i.i.i.i.i.i.i.i: ; preds = %entry
+ %dec.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 305, !dbg !51
+ br label %_Z5foo10ii.exit, !dbg !51
+
+if.end.i.i.i.i.i.i.i.i.i.i.i: ; preds = %entry
+ %sub.i.i.i.i.i.i.i.i.i.i.i = add nsw i32 %mul3.i.i.i.i.i.i.i.i.i.i, -7, !dbg !53
+ %cmp5.i.i.i.i.i.i.i.i.i.i.i = icmp sgt i32 %mul.i.i.i.i.i.i.i.i.i.i, %sub.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+ br i1 %cmp5.i.i.i.i.i.i.i.i.i.i.i, label %if.then6.i.i.i.i.i.i.i.i.i.i.i, label %if.end8.i.i.i.i.i.i.i.i.i.i.i, !dbg !53
+
+if.then6.i.i.i.i.i.i.i.i.i.i.i: ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+ %mul.i.i.i.i.i.i.i.i.i.i.i = shl nsw i32 %mul.i.i.i.i.i.i.i.i.i.i, 1, !dbg !54
+ br label %_Z5foo10ii.exit, !dbg !54
+
+if.end8.i.i.i.i.i.i.i.i.i.i.i: ; preds = %if.end.i.i.i.i.i.i.i.i.i.i.i
+ %inc.i.i.i.i.i.i.i.i.i.i.i = add i32 %1, 307, !dbg !56
+ br label %_Z5foo10ii.exit, !dbg !56
+
+_Z5foo10ii.exit: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i
+ %retval.0.i.i.i.i.i.i.i.i.i.i.i = phi i32 [ %dec.i.i.i.i.i.i.i.i.i.i.i, %if.then.i.i.i.i.i.i.i.i.i.i.i ], [ %mul.i.i.i.i.i.i.i.i.i.i.i, %if.then6.i.i.i.i.i.i.i.i.i.i.i ], [ %inc.i.i.i.i.i.i.i.i.i.i.i, %if.end8.i.i.i.i.i.i.i.i.i.i.i ], !dbg !56
+ store i32 %retval.0.i.i.i.i.i.i.i.i.i.i.i, ptr addrspace(1) @g, align 4, !dbg !57
+ ret void, !dbg !58
+}
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!nvvm.annotations = !{!3}
+!llvm.module.flags = !{!4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "t5.cu", directory: "")
+!2 = !{}
+!3 = !{ptr @_Z6kernelii, !"kernel", i32 1}
+!4 = !{i32 1, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelii", scope: !1, file: !1, line: 41, type: !6, scopeLine: 41, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!8 = distinct !DILexicalBlock(scope: !9, file: !1, line: 35, column: 28)
+!9 = distinct !DISubprogram(name: "foo10", linkageName: "_Z5foo10ii", scope: !1, file: !1, line: 35, type: !6, scopeLine: 35, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!10 = distinct !DILocation(line: 42, column: 3, scope: !11)
+!11 = distinct !DILexicalBlock(scope: !5, file: !1, line: 41, column: 29)
+!12 = !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!13 = distinct !DILexicalBlock(scope: !14, file: !1, line: 32, column: 28)
+!14 = distinct !DISubprogram(name: "foo9", linkageName: "_Z4foo9ii", scope: !1, file: !1, line: 32, type: !6, scopeLine: 32, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!15 = distinct !DILocation(line: 36, column: 3, scope: !8, inlinedAt: !10)
+!16 = !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 29, column: 28)
+!18 = distinct !DISubprogram(name: "foo8", linkageName: "_Z4foo8ii", scope: !1, file: !1, line: 29, type: !6, scopeLine: 29, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!19 = distinct !DILocation(line: 33, column: 3, scope: !13, inlinedAt: !15)
+!20 = !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 26, column: 28)
+!22 = distinct !DISubprogram(name: "foo7", linkageName: "_Z4foo7ii", scope: !1, file: !1, line: 26, type: !6, scopeLine: 26, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!23 = distinct !DILocation(line: 30, column: 3, scope: !17, inlinedAt: !19)
+!24 = !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 20, column: 28)
+!26 = distinct !DISubprogram(name: "foo5", linkageName: "_Z4foo5ii", scope: !1, file: !1, line: 20, type: !6, scopeLine: 20, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 24, column: 3, scope: !28, inlinedAt: !30)
+!28 = distinct !DILexicalBlock(scope: !29, file: !1, line: 23, column: 28)
+!29 = distinct !DISubprogram(name: "foo6", linkageName: "_Z4foo6ii", scope: !1, file: !1, line: 23, type: !6, scopeLine: 23, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!30 = distinct !DILocation(line: 27, column: 3, scope: !21, inlinedAt: !23)
+!31 = !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!32 = distinct !DILexicalBlock(scope: !33, file: !1, line: 17, column: 28)
+!33 = distinct !DISubprogram(name: "foo4", linkageName: "_Z4foo4ii", scope: !1, file: !1, line: 17, type: !6, scopeLine: 17, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!34 = distinct !DILocation(line: 21, column: 3, scope: !25, inlinedAt: !27)
+!35 = !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!36 = distinct !DILexicalBlock(scope: !37, file: !1, line: 14, column: 28)
+!37 = distinct !DISubprogram(name: "foo3", linkageName: "_Z4foo3ii", scope: !1, file: !1, line: 14, type: !6, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!38 = distinct !DILocation(line: 18, column: 3, scope: !32, inlinedAt: !34)
+!39 = !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!40 = distinct !DILexicalBlock(scope: !41, file: !1, line: 11, column: 28)
+!41 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2ii", scope: !1, file: !1, line: 11, type: !6, scopeLine: 11, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!42 = distinct !DILocation(line: 15, column: 3, scope: !36, inlinedAt: !38)
+!43 = !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!44 = distinct !DILexicalBlock(scope: !45, file: !1, line: 8, column: 28)
+!45 = distinct !DISubprogram(name: "foo1", linkageName: "_Z4foo1ii", scope: !1, file: !1, line: 8, type: !6, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!46 = distinct !DILocation(line: 12, column: 3, scope: !40, inlinedAt: !42)
+!47 = !DILocation(line: 2, column: 3, scope: !48, inlinedAt: !50)
+!48 = distinct !DILexicalBlock(scope: !49, file: !1, line: 1, column: 28)
+!49 = distinct !DISubprogram(name: "foo0", linkageName: "_Z4foo0ii", scope: !1, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!50 = distinct !DILocation(line: 9, column: 3, scope: !44, inlinedAt: !46)
+!51 = !DILocation(line: 3, column: 5, scope: !52, inlinedAt: !50)
+!52 = distinct !DILexicalBlock(scope: !48, file: !1, line: 2, column: 3)
+!53 = !DILocation(line: 4, column: 3, scope: !48, inlinedAt: !50)
+!54 = !DILocation(line: 5, column: 5, scope: !55, inlinedAt: !50)
+!55 = distinct !DILexicalBlock(scope: !48, file: !1, line: 4, column: 3)
+!56 = !DILocation(line: 6, column: 3, scope: !48, inlinedAt: !50)
+!57 = !DILocation(line: 42, column: 3, scope: !11)
+!58 = !DILocation(line: 43, column: 1, scope: !11)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
new file mode 100644
index 0000000000000..8c5e48814c59b
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_6.ll
@@ -0,0 +1,343 @@
+; RUN: llc < %s -mattr=+ptx72 -O0 | FileCheck %s
+;
+;; Test same function inlined multiple times from different call sites - verifies that
+;; inlined_at information correctly distinguishes multiple inline instances of rrand() and znew() called from different locations within a loop.
+;
+; typedef unsigned long long MYSIZE_T; // avoid platform dependence
+; struct GridOpt {
+; int launch;
+; int sync;
+; int size;
+; MYSIZE_T itr;
+; };
+;
+; enum LaunchOpt {
+; LaunchOptPerThreadStream = 0,
+; LaunchOptBlockSharedStream,
+; LaunchOptAllNullStream,
+; LaunchOptForkBomb,
+; LaunchOptLoopLaunch,
+; LaunchOptRand,
+; LaunchOptSize,
+; };
+;
+; // post launch sync options
+; enum SyncOpt {
+; SyncOptAllThreadsSync = 0,
+; SyncOptOneSyncPerBlock,
+; SyncOptRand,
+; SyncOptSize,
+; };
+;
+; // size of launch options
+; enum SizeOpt {
+; SizeOptSingleWarp = 0,
+; SizeOptMultiWarp,
+; SizeOptMultiBlock,
+; SizeOptRand,
+; SizeOptSize,
+; };
+;
+;
+; // device side failure codes
+; enum ErrorStatus {
+; Success = 0,
+; DeviceRuntimeFailure = 1,
+; DeviceMallocFailure = 2,
+; DeviceHardwareFailure = 3,
+; InvalidInput = 4,
+; };
+;
+; __device__ MYSIZE_T znew( MYSIZE_T seed )
+; {
+; return 36969 * ( seed & 65535 ) + seed >> 16;
+; }
+;
+; __device__ MYSIZE_T rrand( MYSIZE_T *seed )
+; {
+; *seed = znew( *seed );
+; return *seed;
+; }
+;
+; __device__ GridOpt loopOpt;
+;
+; __global__ void cnpWideLaunch(GridOpt opt, MYSIZE_T maxLaunches, MYSIZE_T randomSeed, int *status, int *launchCounts)
+; {
+; // MYSIZE_T threadSeed = randomSeed + blockDim.x * blockIdx.x + threadIdx.x;
+; MYSIZE_T blockSeed = randomSeed + blockIdx.x;
+;
+; // this device launch consumes a launch slot
+; maxLaunches--;
+;
+; // compute number of launches per block
+; MYSIZE_T blockLaunches = maxLaunches / gridDim.x;
+; MYSIZE_T extraLaunches = maxLaunches - gridDim.x * blockLaunches;
+; if (blockIdx.x < extraLaunches) {
+; blockLaunches++;
+; }
+;
+; // clear launchcount with a thread in each block
+; if (threadIdx.x == 0)
+; launchCounts[blockIdx.x] = 0;
+;
+; // compute per block random selections for sync/launch size/stream
+; for (MYSIZE_T i = 0; i < opt.itr; i++) {
+; loopOpt = opt;
+; if (opt.launch == LaunchOptRand) {
+; loopOpt.launch = rrand(&blockSeed) % (LaunchOptSize - 1);
+; }
+; if (opt.sync == SyncOptRand) {
+; loopOpt.sync = rrand(&blockSeed) % (SyncOptSize - 1);
+; }
+; if (opt.size == SizeOptRand) {
+; loopOpt.size = rrand(&blockSeed) % (SizeOptSize - 1);
+; }
+; __syncthreads();
+; if (threadIdx.x == 0) {
+; //printf("block %d launchCount %d blockLaunches:%d\n", blockIdx.x, launchCounts[blockIdx.x], blockLaunches);
+; int launchCount = launchCounts[blockIdx.x];
+;
+; // fail if block did not generate enough launches
+; if (!*status && (launchCount != blockLaunches)) {
+; *status = DeviceHardwareFailure;
+; }
+; // clear last iteration's launch
+; launchCounts[blockIdx.x] = 0;
+; }
+; __syncthreads();
+;
+; // fail if a global error is present
+; if (*status) {
+; return;
+; }
+; }
+; }
+;
+; CHECK: .loc [[FILENUM:[1-9]]] 84
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 84
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 87
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 87
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .loc [[FILENUM:[1-9]]] 90
+; CHECK: .loc [[FILENUM]] 55 {{[0-9]*}}, function_name [[RRANDNAME]], inlined_at [[FILENUM]] 90
+; CHECK: .loc [[FILENUM]] 50 {{[0-9]*}}, function_name [[ZNEWNAME]], inlined_at [[FILENUM]] 55
+; CHECK: .section .debug_str
+; CHECK: {
+; CHECK: [[RRANDNAME]]:
+; CHECK-NEXT: // {{.*}} _Z5rrandPy
+; CHECK: [[ZNEWNAME]]:
+; CHECK-NEXT: // {{.*}} _Z4znewy
+; CHECK: }
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.GridOpt = type { i32, i32, i32, i64 }
+
+ at loopOpt = internal addrspace(1) global %struct.GridOpt zeroinitializer, align 8
+ at llvm.used = appending global [2 x ptr] [ptr @_Z13cnpWideLaunch7GridOptyyPiS0_, ptr addrspacecast (ptr addrspace(1) @loopOpt to ptr)], section "llvm.metadata"
+
+; Function Attrs: alwaysinline convergent mustprogress norecurse nounwind willreturn
+define void @_Z13cnpWideLaunch7GridOptyyPiS0_(%struct.GridOpt noundef %opt, i64 noundef %maxLaunches, i64 noundef %randomSeed, ptr noundef captures(none) %status, ptr noundef captures(none) %launchCounts) #0 !dbg !4 {
+entry:
+ %0 = addrspacecast ptr %status to ptr addrspace(1)
+ %1 = addrspacecast ptr %launchCounts to ptr addrspace(1)
+ %opt.fca.0.extract = extractvalue %struct.GridOpt %opt, 0
+ %opt.fca.1.extract = extractvalue %struct.GridOpt %opt, 1
+ %opt.fca.2.extract = extractvalue %struct.GridOpt %opt, 2
+ %opt.fca.3.extract = extractvalue %struct.GridOpt %opt, 3
+ %2 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !dbg !6
+ %conv = zext nneg i32 %2 to i64, !dbg !6
+ %add = add i64 %randomSeed, %conv, !dbg !6
+ %dec = add i64 %maxLaunches, -1, !dbg !8
+ %3 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !dbg !9
+ %conv6 = zext nneg i32 %3 to i64, !dbg !9
+ %div = udiv i64 %dec, %conv6, !dbg !9
+ %mul = mul i64 %div, %conv6, !dbg !10
+ %sub = sub i64 %dec, %mul, !dbg !10
+ %cmp = icmp ugt i64 %sub, %conv, !dbg !11
+ %inc = zext i1 %cmp to i64, !dbg !11
+ %spec.select = add i64 %div, %inc, !dbg !11
+ %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !dbg !12
+ %cmp20 = icmp eq i32 %4, 0, !dbg !12
+ br i1 %cmp20, label %if.then22, label %if.end26, !dbg !12
+
+if.then22: ; preds = %entry
+ %getElem = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv, !dbg !13
+ store i32 0, ptr addrspace(1) %getElem, align 4, !dbg !13
+ br label %if.end26, !dbg !13
+
+if.end26: ; preds = %if.then22, %entry
+ %cmp3021.not = icmp eq i64 %opt.fca.3.extract, 0, !dbg !15
+ br i1 %cmp3021.not, label %return, label %for.body.preheader, !dbg !15
+
+for.body.preheader: ; preds = %if.end26
+ %cmp34 = icmp eq i32 %opt.fca.0.extract, 5
+ %cmp41 = icmp eq i32 %opt.fca.1.extract, 2
+ %cmp50 = icmp eq i32 %opt.fca.2.extract, 3
+ %getElem11 = getelementptr inbounds nuw i32, ptr addrspace(1) %1, i64 %conv
+ br label %for.body, !dbg !16
+
+for.body: ; preds = %for.body.preheader, %if.end83
+ %i.023 = phi i64 [ %inc90, %if.end83 ], [ 0, %for.body.preheader ]
+ %blockSeed.022 = phi i64 [ %blockSeed.3, %if.end83 ], [ %add, %for.body.preheader ]
+ %opt.elt = extractvalue %struct.GridOpt %opt, 0, !dbg !20
+ %opt.elt25 = extractvalue %struct.GridOpt %opt, 1, !dbg !20
+ %5 = insertelement <2 x i32> poison, i32 %opt.elt, i64 0, !dbg !20
+ %6 = insertelement <2 x i32> %5, i32 %opt.elt25, i64 1, !dbg !20
+ store <2 x i32> %6, ptr addrspace(1) @loopOpt, align 8, !dbg !20
+ %opt.elt26 = extractvalue %struct.GridOpt %opt, 2, !dbg !20
+ store i32 %opt.elt26, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !20
+ %opt.elt27 = extractvalue %struct.GridOpt %opt, 3, !dbg !20
+ store i64 %opt.elt27, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 16), align 8, !dbg !20
+ br i1 %cmp34, label %if.then36, label %if.end38, !dbg !16
+
+if.then36: ; preds = %for.body
+ %and.i = and i64 %blockSeed.022, 65535, !dbg !21
+ %mul.i = mul nuw nsw i64 %and.i, 36969, !dbg !21
+ %add.i = add i64 %mul.i, %blockSeed.022, !dbg !21
+ %shr.i = lshr i64 %add.i, 16, !dbg !21
+ %rem = urem i64 %shr.i, 5, !dbg !29
+ %conv37 = trunc nuw nsw i64 %rem to i32, !dbg !29
+ store i32 %conv37, ptr addrspace(1) @loopOpt, align 8, !dbg !29
+ br label %if.end38, !dbg !29
+
+if.end38: ; preds = %if.then36, %for.body
+ %blockSeed.1 = phi i64 [ %shr.i, %if.then36 ], [ %blockSeed.022, %for.body ], !dbg !30
+ br i1 %cmp41, label %if.then43, label %if.end47, !dbg !30
+
+if.then43: ; preds = %if.end38
+ %and.i13 = and i64 %blockSeed.1, 65535, !dbg !31
+ %mul.i14 = mul nuw nsw i64 %and.i13, 36969, !dbg !31
+ %add.i15 = add i64 %mul.i14, %blockSeed.1, !dbg !31
+ %shr.i16 = lshr i64 %add.i15, 16, !dbg !31
+ %7 = trunc i64 %shr.i16 to i32, !dbg !35
+ %conv46 = and i32 %7, 1, !dbg !35
+ store i32 %conv46, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 4), align 4, !dbg !35
+ br label %if.end47, !dbg !35
+
+if.end47: ; preds = %if.then43, %if.end38
+ %blockSeed.2 = phi i64 [ %shr.i16, %if.then43 ], [ %blockSeed.1, %if.end38 ], !dbg !36
+ br i1 %cmp50, label %if.then52, label %if.end56, !dbg !36
+
+if.then52: ; preds = %if.end47
+ %and.i17 = and i64 %blockSeed.2, 65535, !dbg !37
+ %mul.i18 = mul nuw nsw i64 %and.i17, 36969, !dbg !37
+ %add.i19 = add i64 %mul.i18, %blockSeed.2, !dbg !37
+ %shr.i20 = lshr i64 %add.i19, 16, !dbg !37
+ %rem54 = urem i64 %shr.i20, 3, !dbg !41
+ %conv55 = trunc nuw nsw i64 %rem54 to i32, !dbg !41
+ store i32 %conv55, ptr addrspace(1) getelementptr inbounds nuw (i8, ptr addrspace(1) @loopOpt, i64 8), align 8, !dbg !41
+ br label %if.end56, !dbg !41
+
+if.end56: ; preds = %if.then52, %if.end47
+ %blockSeed.3 = phi i64 [ %shr.i20, %if.then52 ], [ %blockSeed.2, %if.end47 ], !dbg !42
+ tail call void @llvm.nvvm.barrier0(), !dbg !42
+ br i1 %cmp20, label %if.then61, label %if.end83, !dbg !43
+
+if.then61: ; preds = %if.end56
+ %tmp67 = load i32, ptr addrspace(1) %getElem11, align 4, !dbg !44
+ %tmp69 = load i32, ptr addrspace(1) %0, align 4, !dbg !46
+ %tobool.not = icmp eq i32 %tmp69, 0, !dbg !46
+ %conv71 = sext i32 %tmp67 to i64, !dbg !46
+ %cmp73 = icmp ne i64 %spec.select, %conv71, !dbg !46
+ %or.cond = select i1 %tobool.not, i1 %cmp73, i1 false, !dbg !46
+ br i1 %or.cond, label %if.then75, label %if.end77, !dbg !46
+
+if.then75: ; preds = %if.then61
+ store i32 3, ptr addrspace(1) %0, align 4, !dbg !47
+ br label %if.end77, !dbg !47
+
+if.end77: ; preds = %if.then61, %if.then75
+ store i32 0, ptr addrspace(1) %getElem11, align 4, !dbg !49
+ br label %if.end83, !dbg !49
+
+if.end83: ; preds = %if.end77, %if.end56
+ tail call void @llvm.nvvm.barrier0(), !dbg !50
+ %tmp85 = load i32, ptr addrspace(1) %0, align 4, !dbg !51
+ %tobool86.not = icmp eq i32 %tmp85, 0, !dbg !51
+ %inc90 = add nuw i64 %i.023, 1
+ %cmp30 = icmp ult i64 %inc90, %opt.fca.3.extract
+ %or.cond24 = select i1 %tobool86.not, i1 %cmp30, i1 false, !dbg !51
+ br i1 %or.cond24, label %for.body, label %return, !dbg !51, !llvm.loop !52
+
+return: ; preds = %if.end83, %if.end26
+ ret void, !dbg !54
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
+
+; Function Attrs: convergent nocallback nounwind
+declare void @llvm.nvvm.barrier0() #2
+
+attributes #0 = { alwaysinline convergent mustprogress norecurse nounwind willreturn "target-cpu"="sm_75" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t6.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "cnpWideLaunch", linkageName: "_Z13cnpWideLaunch7GridOptyyPiS0_", scope: !1, file: !1, line: 61, type: !5, scopeLine: 61, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 64, column: 5, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 62, column: 1)
+!8 = !DILocation(line: 67, column: 5, scope: !7)
+!9 = !DILocation(line: 70, column: 5, scope: !7)
+!10 = !DILocation(line: 71, column: 5, scope: !7)
+!11 = !DILocation(line: 72, column: 5, scope: !7)
+!12 = !DILocation(line: 77, column: 5, scope: !7)
+!13 = !DILocation(line: 78, column: 9, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !7, file: !1, line: 77, column: 5)
+!15 = !DILocation(line: 81, column: 5, scope: !7)
+!16 = !DILocation(line: 83, column: 9, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !18, file: !1, line: 81, column: 5)
+!18 = distinct !DILexicalBlock(scope: !19, file: !1, line: 81, column: 5)
+!19 = distinct !DILexicalBlock(scope: !7, file: !1, line: 81, column: 5)
+!20 = !DILocation(line: 82, column: 9, scope: !17)
+!21 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !24)
+!22 = distinct !DILexicalBlock(scope: !23, file: !1, line: 49, column: 1)
+!23 = distinct !DISubprogram(name: "znew", linkageName: "_Z4znewy", scope: !1, file: !1, line: 48, type: !5, scopeLine: 48, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!24 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !27)
+!25 = distinct !DILexicalBlock(scope: !26, file: !1, line: 54, column: 1)
+!26 = distinct !DISubprogram(name: "rrand", linkageName: "_Z5rrandPy", scope: !1, file: !1, line: 53, type: !5, scopeLine: 53, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!27 = distinct !DILocation(line: 84, column: 13, scope: !28)
+!28 = distinct !DILexicalBlock(scope: !17, file: !1, line: 83, column: 9)
+!29 = !DILocation(line: 84, column: 13, scope: !28)
+!30 = !DILocation(line: 86, column: 9, scope: !17)
+!31 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !32)
+!32 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !33)
+!33 = distinct !DILocation(line: 87, column: 13, scope: !34)
+!34 = distinct !DILexicalBlock(scope: !17, file: !1, line: 86, column: 9)
+!35 = !DILocation(line: 87, column: 13, scope: !34)
+!36 = !DILocation(line: 89, column: 9, scope: !17)
+!37 = !DILocation(line: 50, column: 5, scope: !22, inlinedAt: !38)
+!38 = distinct !DILocation(line: 55, column: 5, scope: !25, inlinedAt: !39)
+!39 = distinct !DILocation(line: 90, column: 13, scope: !40)
+!40 = distinct !DILexicalBlock(scope: !17, file: !1, line: 89, column: 9)
+!41 = !DILocation(line: 90, column: 13, scope: !40)
+!42 = !DILocation(line: 92, column: 9, scope: !17)
+!43 = !DILocation(line: 93, column: 9, scope: !17)
+!44 = !DILocation(line: 95, column: 13, scope: !45)
+!45 = distinct !DILexicalBlock(scope: !17, file: !1, line: 93, column: 9)
+!46 = !DILocation(line: 98, column: 13, scope: !45)
+!47 = !DILocation(line: 99, column: 17, scope: !48)
+!48 = distinct !DILexicalBlock(scope: !45, file: !1, line: 98, column: 13)
+!49 = !DILocation(line: 102, column: 13, scope: !45)
+!50 = !DILocation(line: 104, column: 9, scope: !17)
+!51 = !DILocation(line: 107, column: 9, scope: !17)
+!52 = distinct !{!52, !53}
+!53 = !{!"llvm.loop.mustprogress"}
+!54 = !DILocation(line: 111, column: 1, scope: !7)
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
new file mode 100644
index 0000000000000..9a76fe7ac4a1f
--- /dev/null
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_7.ll
@@ -0,0 +1,118 @@
+; RUN: llc < %s -mattr=+ptx72 | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc < %s -mattr=+ptx70 | FileCheck %s --check-prefix=PTXVERSION
+; RUN: llc < %s -mattr=+ptx72 --line-info-inlined-at=false | FileCheck %s --check-prefix=NOFLAG
+;
+;; Test command-line flags to control inlined_at emission - verifies that PTX 7.2+ emits
+;; function_name and inlined_at by default, while PTX 7.0 or --line-info-inlined-at=false disables it.
+;
+; #include <stdio.h>
+;
+; __device__ int gg;
+;
+; __device__ void foo();
+; __device__ void bar();
+; extern __device__ void calculate();
+; __device__ void foo() {
+; if (gg > 7)
+; bar();
+; calculate();
+; }
+;
+; __device__ void bar() {
+; if (gg > 17)
+; foo();
+; calculate();
+; }
+;
+; __global__ void kernel() {
+; foo();
+; }
+;
+; DEFAULT: .loc [[FILENUM:[1-9]]] 10
+; DEFAULT: .loc [[FILENUM]] 15 {{[0-9]*}}, function_name [[BARNAME:\$L__info_string[0-9]+]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 16 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .loc [[FILENUM]] 17 {{[0-9]*}}, function_name [[BARNAME]], inlined_at [[FILENUM]] 10
+; DEFAULT: .section .debug_str
+; DEFAULT: {
+; DEFAULT: [[BARNAME]]:
+; DEFAULT-NEXT: // {{.*}} _Z3barv
+; DEFAULT: }
+
+; NOFLAG-NOT: function_name
+; NOFLAG-NOT: inlined_at {{[1-9]}}
+; NOFLAG-NOT: .section .debug_str
+
+; PTXVERSION-NOT: function_name
+; PTXVERSION-NOT: inlined_at {{[1-9]}}
+; PTXVERSION-NOT: .section .debug_str
+
+source_filename = "<unnamed>"
+target datalayout = "e-p:64:64:64-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-f128:128:128-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64-a:8:8-p6:32:32"
+target triple = "nvptx64-nvidia-cuda"
+
+ at gg = internal addrspace(1) global i32 0, align 4
+ at llvm.used = appending global [2 x ptr] [ptr @_Z6kernelv, ptr addrspacecast (ptr addrspace(1) @gg to ptr)], section "llvm.metadata"
+
+; Function Attrs: mustprogress willreturn
+define internal fastcc void @_Z3foov() unnamed_addr #0 !dbg !4 {
+entry:
+ %tmp = load i32, ptr addrspace(1) @gg, align 4, !dbg !6
+ %cmp = icmp sgt i32 %tmp, 7, !dbg !6
+ br i1 %cmp, label %if.then, label %if.end, !dbg !6
+
+if.then: ; preds = %entry
+ %cmp.i = icmp samesign ugt i32 %tmp, 17, !dbg !8
+ br i1 %cmp.i, label %if.then.i, label %_Z3barv.exit, !dbg !8
+
+if.then.i: ; preds = %if.then
+ tail call fastcc void @_Z3foov(), !dbg !13
+ br label %_Z3barv.exit, !dbg !13
+
+_Z3barv.exit: ; preds = %if.then, %if.then.i
+ tail call void @_Z9calculatev(), !dbg !15
+ br label %if.end, !dbg !16
+
+if.end: ; preds = %_Z3barv.exit, %entry
+ tail call void @_Z9calculatev(), !dbg !17
+ ret void, !dbg !18
+}
+
+declare void @_Z9calculatev() local_unnamed_addr #1
+
+; Function Attrs: alwaysinline mustprogress willreturn
+define void @_Z6kernelv() #2 !dbg !19 {
+entry:
+ tail call fastcc void @_Z3foov(), !dbg !20
+ ret void, !dbg !22
+}
+
+attributes #0 = { mustprogress willreturn "target-cpu"="sm_75" }
+attributes #1 = { "target-cpu"="sm_75" }
+attributes #2 = { alwaysinline mustprogress willreturn "target-cpu"="sm_75" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
+!1 = !DIFile(filename: "t7.cu", directory: "")
+!2 = !{}
+!3 = !{i32 1, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 8, type: !5, scopeLine: 8, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(types: !2)
+!6 = !DILocation(line: 9, column: 3, scope: !7)
+!7 = distinct !DILexicalBlock(scope: !4, file: !1, line: 8, column: 29)
+!8 = !DILocation(line: 15, column: 3, scope: !9, inlinedAt: !11)
+!9 = distinct !DILexicalBlock(scope: !10, file: !1, line: 14, column: 29)
+!10 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 14, type: !5, scopeLine: 14, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!11 = distinct !DILocation(line: 10, column: 5, scope: !12)
+!12 = distinct !DILexicalBlock(scope: !7, file: !1, line: 9, column: 3)
+!13 = !DILocation(line: 16, column: 5, scope: !14, inlinedAt: !11)
+!14 = distinct !DILexicalBlock(scope: !9, file: !1, line: 15, column: 3)
+!15 = !DILocation(line: 17, column: 3, scope: !9, inlinedAt: !11)
+!16 = !DILocation(line: 10, column: 5, scope: !12)
+!17 = !DILocation(line: 11, column: 3, scope: !7)
+!18 = !DILocation(line: 12, column: 1, scope: !7)
+!19 = distinct !DISubprogram(name: "kernel", linkageName: "_Z6kernelv", scope: !1, file: !1, line: 20, type: !5, scopeLine: 20, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!20 = !DILocation(line: 21, column: 3, scope: !21)
+!21 = distinct !DILexicalBlock(scope: !19, file: !1, line: 20, column: 29)
+!22 = !DILocation(line: 22, column: 1, scope: !21)
>From ca957612c02fe3d972c8bbf6b8a0564ce5f21614 Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Fri, 5 Dec 2025 13:11:54 -0800
Subject: [PATCH 2/3] changed hook function names and few comments
---
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 15 +++++++--------
llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 12 ++++++------
llvm/lib/Target/NVPTX/CMakeLists.txt | 2 +-
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp | 9 +++++----
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h | 10 +++++-----
5 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 3fd60d70cfb68..29c09fc526ce1 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2204,18 +2204,17 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
Flags |= DWARF2_FLAG_IS_STMT;
}
- // Call the hook that allows targets to customize source line recording
- recordSourceLineHook(*MI, DL, Flags);
+ // Call target-specific source line recording.
+ recordTargetSourceLine(*MI, DL, Flags);
// If we're not at line 0, remember this location.
if (DL.getLine())
PrevInstLoc = DL;
}
-// Default implementation of target-specific hook for custom source line
-// recording
-void DwarfDebug::recordSourceLineHook(const MachineInstr &MI,
- const DebugLoc &DL, unsigned Flags) {
+// Default implementation of target-specific source line recording.
+void DwarfDebug::recordTargetSourceLine(const MachineInstr &MI,
+ const DebugLoc &DL, unsigned Flags) {
SmallString<128> LocationString;
if (Asm->OutStreamer->isVerboseAsm()) {
raw_svector_ostream OS(LocationString);
@@ -2719,8 +2718,8 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
Asm->OutStreamer->getContext().setDwarfCompileUnitID(
getDwarfCompileUnitIDForLineTable(CU));
- // Call target-specific hook for custom initialization
- beginFunctionHook(*MF);
+ // Call target-specific debug info initialization.
+ initializeTargetDebugInfo(*MF);
// Record beginning of function.
PrologEndLoc = emitInitialLocDirective(
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 58aeb09645984..25d233025652d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -717,13 +717,13 @@ class DwarfDebug : public DebugHandlerBase {
void skippedNonDebugFunction() override;
- /// Target-specific hook for custom initialization,
- /// default implementation is empty, only being used for NVPTX target
- virtual void beginFunctionHook(const MachineFunction &MF) {}
+ /// Target-specific debug info initialization at function start.
+ /// Default implementation is empty, overridden by NVPTX target.
+ virtual void initializeTargetDebugInfo(const MachineFunction &MF) {}
- /// Target-specific hook for custom source line recording
- virtual void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
- unsigned Flags);
+ /// Target-specific source line recording.
+ virtual void recordTargetSourceLine(const MachineInstr &MI,
+ const DebugLoc &DL, unsigned Flags);
const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
return InfoHolder.getUnits();
diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index 505e7e945f108..469aaf7685d09 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,13 +18,13 @@ set(NVPTXCodeGen_sources
NVPTXAssignValidGlobalNames.cpp
NVPTXAtomicLower.cpp
NVPTXCtorDtorLowering.cpp
+ NVPTXIRPeephole.cpp
NVPTXDwarfDebug.cpp
NVPTXForwardParams.cpp
NVPTXFrameLowering.cpp
NVPTXGenericToNVVM.cpp
NVPTXImageOptimizer.cpp
NVPTXInstrInfo.cpp
- NVPTXIRPeephole.cpp
NVPTXISelDAGToDAG.cpp
NVPTXISelLowering.cpp
NVPTXLowerAggrCopies.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
index 57410179a6344..d2f95ce28e783 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -151,15 +151,16 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
}
}
-// NVPTX-specific function initialization hook.
-void NVPTXDwarfDebug::beginFunctionHook(const MachineFunction &MF) {
+// NVPTX-specific debug info initialization.
+void NVPTXDwarfDebug::initializeTargetDebugInfo(const MachineFunction &MF) {
InlinedAtLocs.clear();
collectInlinedAtLocations(MF);
}
// NVPTX-specific source line recording with inlined_at support.
-void NVPTXDwarfDebug::recordSourceLineHook(const MachineInstr &MI,
- const DebugLoc &DL, unsigned Flags) {
+void NVPTXDwarfDebug::recordTargetSourceLine(const MachineInstr &MI,
+ const DebugLoc &DL,
+ unsigned Flags) {
// Call NVPTX-specific implementation that handles inlined_at.
recordSourceLineAndInlinedAt(MI, Flags);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
index d7032725134e6..9794d613ad9ba 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -36,11 +36,11 @@ class NVPTXDwarfDebug : public DwarfDebug {
void collectInlinedAtLocations(const MachineFunction &MF);
protected:
- /// Override hook to collect inlined_at locations.
- void beginFunctionHook(const MachineFunction &MF) override;
- /// Override hook to record source line information with inlined_at support.
- void recordSourceLineHook(const MachineInstr &MI, const DebugLoc &DL,
- unsigned Flags) override;
+ /// Override to collect inlined_at locations.
+ void initializeTargetDebugInfo(const MachineFunction &MF) override;
+ /// Override to record source line information with inlined_at support.
+ void recordTargetSourceLine(const MachineInstr &MI, const DebugLoc &DL,
+ unsigned Flags) override;
private:
/// NVPTX-specific source line recording with inlined_at support.
>From ee6758a3e2bc794f43965e5879a80ac488fce75e Mon Sep 17 00:00:00 2001
From: Laxman Sole <lsole at nvidia.com>
Date: Thu, 11 Dec 2025 18:46:13 -0800
Subject: [PATCH 3/3] Code refactor to remove scanning pass and track only the
emitted locs
---
llvm/include/llvm/MC/MCStreamer.h | 2 +-
llvm/lib/MC/MCAsmStreamer.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp | 51 +++++------------------
llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h | 8 ++--
llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll | 4 --
5 files changed, 16 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 6fe858c3201eb..6f1b3d5ccb2cd 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -902,7 +902,7 @@ class LLVM_ABI MCStreamer {
StringRef FileName,
StringRef Comment = {});
- /// This is same as emitDwarfLocDirective, except has capability to
+ /// This is same as emitDwarfLocDirective, except it has the capability to
/// add inlined_at information.
virtual void emitDwarfLocDirectiveWithInlinedAt(
unsigned FileNo, unsigned Line, unsigned Column, unsigned FileIA,
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index bdfaff0a22417..18a61ea7c9e00 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -1739,7 +1739,7 @@ void MCAsmStreamer::emitDwarfLocDirectiveSuffix(unsigned FileNo, unsigned Line,
OS << Comment;
}
- // Emit end of line and update parent state
+ // Emit end of line and update the baseclass state
EmitEOL();
MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
Discriminator, FileName, Comment);
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
index d2f95ce28e783..f5a674415cb28 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.cpp
@@ -35,36 +35,6 @@ static cl::opt<bool> LineInfoWithInlinedAt(
NVPTXDwarfDebug::NVPTXDwarfDebug(AsmPrinter *A) : DwarfDebug(A) {}
-// Collect all inlined_at locations for the current function.
-void NVPTXDwarfDebug::collectInlinedAtLocations(const MachineFunction &MF) {
- const DISubprogram *SP = MF.getFunction().getSubprogram();
- assert(SP && "expecting valid subprogram here");
-
- // inlined_at support requires PTX 7.2 or later.
- const NVPTXSubtarget &STI = MF.getSubtarget<NVPTXSubtarget>();
- if (STI.getPTXVersion() < 72)
- return;
-
- if (!(SP->getUnit()->isDebugDirectivesOnly() ||
- SP->getUnit()->getEmissionKind() == DICompileUnit::LineTablesOnly) ||
- !LineInfoWithInlinedAt) // No enhanced lineinfo, we are done.
- return;
-
- for (const MachineBasicBlock &MBB : MF) {
- for (const MachineInstr &MI : MBB) {
- const DebugLoc &DL = MI.getDebugLoc();
- if (!DL)
- continue;
- const DILocation *InlinedAt = DL.getInlinedAt();
- while (InlinedAt) {
- if (!InlinedAtLocs.insert(InlinedAt).second)
- break;
- InlinedAt = InlinedAt->getInlinedAt();
- }
- }
- }
-}
-
// NVPTX-specific source line recording with inlined_at support.
void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
unsigned Flags) {
@@ -73,7 +43,7 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
// inlined_at directive, we might need to emit additional .loc prior
// to it for the location contained in the inlined_at.
SmallVector<const DILocation *, 8> WorkList;
- DenseSet<const DILocation *> WorkListSet;
+ SmallDenseSet<const DILocation *, 8> WorkListSet;
const DILocation *EmitLoc = DL.get();
const DISubprogram *SP = MI.getMF()->getFunction().getSubprogram();
@@ -101,9 +71,10 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
break;
const DILocation *IA = EmitLoc->getInlinedAt();
- // Check if this has inlined_at information, and if we have not yet
- // emitted the .loc for the inlined_at location.
- if (IA && InlinedAtLocs.contains(IA))
+ // Check if this has inlined_at information, and if the parent location
+ // has not yet been emitted. If already emitted, we don't need to
+ // re-emit the parent chain.
+ if (IA && !EmittedInlinedAtLocs.contains(IA))
EmitLoc = IA;
else // We are done
break;
@@ -129,10 +100,6 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
const unsigned FileNo = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
.getOrCreateSourceID(Scope->getFile());
- // Remove this location from the work list if it is in the inlined_at
- // locations set.
- if (EnhancedLineinfo && InlinedAtLocs.contains(Current))
- InlinedAtLocs.erase(Current);
if (EnhancedLineinfo && InlinedAt) {
const unsigned FileIA = static_cast<DwarfCompileUnit &>(*getUnits()[CUID])
@@ -148,13 +115,17 @@ void NVPTXDwarfDebug::recordSourceLineAndInlinedAt(const MachineInstr &MI,
Asm->OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
Discriminator, Fn);
}
+ // Mark this location as emitted so we don't re-emit the parent chain
+ // for subsequent instructions that share the same inlined_at parent.
+ if (EnhancedLineinfo)
+ EmittedInlinedAtLocs.insert(Current);
}
}
// NVPTX-specific debug info initialization.
void NVPTXDwarfDebug::initializeTargetDebugInfo(const MachineFunction &MF) {
- InlinedAtLocs.clear();
- collectInlinedAtLocations(MF);
+ // Clear the set of emitted inlined_at locations for each new function.
+ EmittedInlinedAtLocs.clear();
}
// NVPTX-specific source line recording with inlined_at support.
diff --git a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
index 9794d613ad9ba..b7cfd5faa36f3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
+++ b/llvm/lib/Target/NVPTX/NVPTXDwarfDebug.h
@@ -25,16 +25,14 @@ namespace llvm {
/// inlined_at support.
class NVPTXDwarfDebug : public DwarfDebug {
private:
- /// Set of InlinedAt locations, used to track if these have been emitted.
- DenseSet<const DILocation *> InlinedAtLocs;
+ /// Set of inlined_at locations that have already been emitted.
+ /// Used to avoid redundant emission of parent chain .loc directives.
+ DenseSet<const DILocation *> EmittedInlinedAtLocs;
public:
/// Constructor - Pass through to DwarfDebug constructor.
NVPTXDwarfDebug(AsmPrinter *A);
- /// Collect all inlined_at locations for the current function.
- void collectInlinedAtLocations(const MachineFunction &MF);
-
protected:
/// Override to collect inlined_at locations.
void initializeTargetDebugInfo(const MachineFunction &MF) override;
diff --git a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
index 57df39e8034c4..31a17dc810b4a 100644
--- a/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
+++ b/llvm/test/DebugInfo/NVPTX/inlinedAt_4.ll
@@ -165,15 +165,11 @@ entry:
attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) "target-cpu"="sm_75" }
!llvm.dbg.cu = !{!0}
-!nvvmir.version = !{!2, !3, !4}
!nvvm.annotations = !{!5, !6, !7, !8}
!llvm.module.flags = !{!9}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly)
!1 = !DIFile(filename: "t4.cu", directory: "")
-!2 = !{i32 2, i32 0, i32 3, i32 2}
-!3 = !{i32 2, i32 0}
-!4 = !{i32 2, i32 0, i32 3, i32 1}
!5 = !{ptr @_Z7kernel1ii, !"kernel", i32 1}
!6 = !{ptr @_Z7kernel2ii, !"kernel", i32 1}
!7 = !{ptr @_Z7kernel3ii, !"kernel", i32 1}
More information about the llvm-commits
mailing list