[llvm] [DAGCombiner] Preserve debug location of original load in fold (conv (load x)) (PR #160236)
    via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Tue Sep 23 04:16:01 PDT 2025
    
    
  
https://github.com/jwu10003 updated https://github.com/llvm/llvm-project/pull/160236
>From 8ce1b4a005482a1a395ea9b2f5ebe6d7c067be95 Mon Sep 17 00:00:00 2001
From: "jian.wu" <jian.wu at amd.com>
Date: Tue, 23 Sep 2025 12:07:14 +0800
Subject: [PATCH 1/3] [DAGCombiner] Preserve debug location of original load in
 fold (conv (load x))
This patch fixes a loss of debug information when DAGCombiner folds a conversion (e.g., a bitcast)
of a load into a new load: `fold (conv (load x)) -> (load (conv*)x)`.
The newly created load node was incorrectly using the debug location (`SDLoc`) of the conversion
operation (the `conv` node, `N`) instead of the location of the original load operation (the `load`
node, `LN0`). The location of the conversion operation often points to compiler-internal
instructions and provides little value for source-level debugging. In contrast, the original load's
location accurately represents the source of the data access in the user's code.
This change ensures the new load inherits the debug location from `LN0` by using `SDLoc(LN0)`,
which improves the debugging experience and fixes a test-case failure observed in the Triton compiler.
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll | 41 +++++++++++++++++++
 2 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6ba6e518899f..4cb0a35aa7b25 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16703,7 +16703,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
         }
       }
       SDValue Load =
-          DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+          DAG.getLoad(VT, SDLoc(LN0), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getMemOperand());
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
       return Load;
diff --git a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
new file mode 100644
index 0000000000000..900c973b712ae
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942  < %s | FileCheck %s
+
+; CHECK-LABEL:  test:
+; CHECK:        .loc    1 8 16                          ; test.py:8:16
+; CHECK-NEXT:   s_load_dword
+
+; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
+define amdgpu_kernel void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1, ptr addrspace(1) inreg readnone captures(none) %2, ptr addrspace(1) inreg readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 {
+  %5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
+  %6 = and i32 %5, 255, !dbg !7
+  %7 = icmp eq i32 %6, 0, !dbg !7
+  br i1 %7, label %8, label %10, !dbg !7
+
+8:                                                ; preds = %4
+  %9 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6
+  store <1 x float> %9, ptr addrspace(1) %1, align 4, !dbg !7
+  br label %10, !dbg !7
+
+10:                                               ; preds = %8, %4
+  ret void, !dbg !9
+}
+
+; Function Attrs: alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" }
+attributes #1 = { alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "test.py", directory: "/path")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 9, column: 20, scope: !4)
+!8 = !DILocation(line: 8, column: 16, scope: !4)
+!9 = !DILocation(line: 9, column: 4, scope: !4)
>From 39959198ad713668629ccc379ef3b3486b1fa421 Mon Sep 17 00:00:00 2001
From: "jian.wu" <jian.wu at amd.com>
Date: Tue, 23 Sep 2025 17:44:25 +0800
Subject: [PATCH 2/3] update test case
---
 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll | 41 -------------------
 .../DebugInfo/AMDGPU/combine-conv-load.ll     | 26 ++++++++++++
 2 files changed, 26 insertions(+), 41 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
 create mode 100644 llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
diff --git a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll b/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
deleted file mode 100644
index 900c973b712ae..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/combine-conv-load.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942  < %s | FileCheck %s
-
-; CHECK-LABEL:  test:
-; CHECK:        .loc    1 8 16                          ; test.py:8:16
-; CHECK-NEXT:   s_load_dword
-
-; Function Attrs: alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite)
-define amdgpu_kernel void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1, ptr addrspace(1) inreg readnone captures(none) %2, ptr addrspace(1) inreg readnone captures(none) %3) local_unnamed_addr #0 !dbg !4 {
-  %5 = tail call i32 @llvm.amdgcn.workitem.id.x(), !dbg !7
-  %6 = and i32 %5, 255, !dbg !7
-  %7 = icmp eq i32 %6, 0, !dbg !7
-  br i1 %7, label %8, label %10, !dbg !7
-
-8:                                                ; preds = %4
-  %9 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6
-  store <1 x float> %9, ptr addrspace(1) %1, align 4, !dbg !7
-  br label %10, !dbg !7
-
-10:                                               ; preds = %8, %4
-  ret void, !dbg !9
-}
-
-; Function Attrs: alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none)
-declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
-
-attributes #0 = { alwaysinline mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "denormal-fp-math-f32"="ieee" "uniform-work-group-size"="false" }
-attributes #1 = { alwaysinline nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
-!1 = !DIFile(filename: "test.py", directory: "/path")
-!2 = !{i32 2, !"Debug Info Version", i32 3}
-!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
-!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
-!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
-!6 = !{}
-!7 = !DILocation(line: 9, column: 20, scope: !4)
-!8 = !DILocation(line: 8, column: 16, scope: !4)
-!9 = !DILocation(line: 9, column: 4, scope: !4)
diff --git a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
new file mode 100644
index 0000000000000..14ce1d9cba098
--- /dev/null
+++ b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+
+; CHECK-LABEL:  test:
+; CHECK:        .loc    1 8 16 prologue_end             ; test.py:8:16
+; CHECK-NEXT:   s_load_dword
+
+define void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1) local_unnamed_addr !dbg !4 {
+  %3 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6
+  store <1 x float> %3, ptr addrspace(1) %1, align 4, !dbg !7
+
+  ret void, !dbg !9
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "triton", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "test.py", directory: "/path")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!4 = distinct !DISubprogram(name: "test", linkageName: "test", scope: !1, file: !1, line: 7, type: !5, scopeLine: 7, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
+!5 = !DISubroutineType(cc: DW_CC_normal, types: !6)
+!6 = !{}
+!7 = !DILocation(line: 9, column: 20, scope: !4)
+!8 = !DILocation(line: 8, column: 16, scope: !4)
+!9 = !DILocation(line: 9, column: 4, scope: !4)
>From c255ce47fe1b4050a3c900fddfb789b0e3b6496d Mon Sep 17 00:00:00 2001
From: "jian.wu" <jian.wu at amd.com>
Date: Tue, 23 Sep 2025 19:14:30 +0800
Subject: [PATCH 3/3] Simplify the test case
---
 llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
index 14ce1d9cba098..0bb3d383248fb 100644
--- a/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
+++ b/llvm/test/DebugInfo/AMDGPU/combine-conv-load.ll
@@ -4,10 +4,9 @@
 ; CHECK:        .loc    1 8 16 prologue_end             ; test.py:8:16
 ; CHECK-NEXT:   s_load_dword
 
-define void @test(ptr addrspace(1) inreg readonly captures(none) %0, ptr addrspace(1) inreg writeonly captures(none) %1) local_unnamed_addr !dbg !4 {
-  %3 = load <1 x float>, ptr addrspace(1) %0, align 4, !dbg !8, !amdgpu.noclobber !6
-  store <1 x float> %3, ptr addrspace(1) %1, align 4, !dbg !7
-
+define void @test(ptr addrspace(1) inreg readonly captures(none) %arg0, ptr addrspace(1) inreg writeonly captures(none) %arg1) local_unnamed_addr !dbg !4 {
+  %ld = load <1 x float>, ptr addrspace(1) %arg0, align 4, !dbg !8, !amdgpu.noclobber !6
+  store <1 x float> %ld, ptr addrspace(1) %arg1, align 4, !dbg !7
   ret void, !dbg !9
 }
 
    
    
More information about the llvm-commits
mailing list