[llvm] [AMDGPU] Fixed llvm-debuginfo-analyzer for AMDGPU. (PR #145125)

Adam Yang via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 28 14:50:36 PDT 2025


https://github.com/adam-yang updated https://github.com/llvm/llvm-project/pull/145125

>From 3254815539b36049fb074bf3cb18ee8bbb2ffdb6 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Fri, 20 Jun 2025 16:52:37 -0700
Subject: [PATCH 1/4] Made llvm-debuginfo-analyzer work for AMDGPU. A few
 changes to generate DWARF correctly in AMDGPU

---
 .../LogicalView/Readers/LVBinaryReader.h      |   3 +-
 .../LogicalView/Readers/LVBinaryReader.cpp    |   4 +-
 .../LogicalView/Readers/LVCodeViewReader.cpp  |  10 +-
 .../LogicalView/Readers/LVDWARFReader.cpp     |  12 +-
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp  |   3 +-
 .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp    |   3 +
 .../llvm-debuginfo-analyzer/DWARF/amdgpu.ll   | 103 ++++++++++++++++++
 7 files changed, 127 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll

diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
index 1847fa8323480..1f8b884bc1b5d 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
@@ -159,7 +159,8 @@ class LVBinaryReader : public LVReader {
   LVAddress WasmCodeSectionOffset = 0;
 
   // Loads all info for the architecture of the provided object file.
-  Error loadGenericTargetInfo(StringRef TheTriple, StringRef TheFeatures);
+  Error loadGenericTargetInfo(StringRef TheTriple, StringRef TheFeatures,
+                              StringRef CPU);
 
   virtual void mapRangeAddress(const object::ObjectFile &Obj) {}
   virtual void mapRangeAddress(const object::ObjectFile &Obj,
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 80b4185b7c600..414f0f3efc82d 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -275,7 +275,8 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) {
 }
 
 Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
-                                            StringRef TheFeatures) {
+                                            StringRef TheFeatures,
+                                            StringRef CPU) {
   std::string TargetLookupError;
   const Target *TheTarget =
       TargetRegistry::lookupTarget(TheTriple, TargetLookupError);
@@ -298,7 +299,6 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
   MAI.reset(AsmInfo);
 
   // Target subtargets.
-  StringRef CPU;
   MCSubtargetInfo *SubtargetInfo(
       TheTarget->createMCSubtargetInfo(TheTriple, CPU, TheFeatures));
   if (!SubtargetInfo)
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
index e5895516b5e77..2ff70816b4bf1 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp
@@ -1190,7 +1190,12 @@ Error LVCodeViewReader::loadTargetInfo(const ObjectFile &Obj) {
     FeaturesValue = SubtargetFeatures();
   }
   FeaturesValue = *Features;
-  return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
+
+  StringRef CPU;
+  if (auto OptCPU = Obj.tryGetCPUName())
+    CPU = *OptCPU;
+
+  return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU);
 }
 
 Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) {
@@ -1200,8 +1205,9 @@ Error LVCodeViewReader::loadTargetInfo(const PDBFile &Pdb) {
   TT.setOS(Triple::Win32);
 
   StringRef TheFeature = "";
+  StringRef TheCPU = "";
 
-  return loadGenericTargetInfo(TT.str(), TheFeature);
+  return loadGenericTargetInfo(TT.str(), TheFeature, TheCPU);
 }
 
 std::string LVCodeViewReader::getRegisterName(LVSmall Opcode,
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
index 696e2bc948a2e..62134dfdadf46 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp
@@ -956,10 +956,7 @@ LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset,
 Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
   // Detect the architecture from the object file. We usually don't need OS
   // info to lookup a target and create register info.
-  Triple TT;
-  TT.setArch(Triple::ArchType(Obj.getArch()));
-  TT.setVendor(Triple::UnknownVendor);
-  TT.setOS(Triple::UnknownOS);
+  Triple TT = Obj.makeTriple();
 
   // Features to be passed to target/subtarget
   Expected<SubtargetFeatures> Features = Obj.getFeatures();
@@ -969,7 +966,12 @@ Error LVDWARFReader::loadTargetInfo(const ObjectFile &Obj) {
     FeaturesValue = SubtargetFeatures();
   }
   FeaturesValue = *Features;
-  return loadGenericTargetInfo(TT.str(), FeaturesValue.getString());
+
+  StringRef CPU;
+  if (auto OptCPU = Obj.tryGetCPUName())
+    CPU = *OptCPU;
+
+  return loadGenericTargetInfo(TT.str(), FeaturesValue.getString(), CPU);
 }
 
 void LVDWARFReader::mapRangeAddress(const ObjectFile &Obj) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 8f89168754180..bf390e836078e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -226,7 +226,8 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
 public:
   ELFAMDGPUAsmBackend(const Target &T, const Triple &TT)
       : AMDGPUAsmBackend(T), Is64Bit(TT.isAMDGCN()),
-        HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
+        HasRelocationAddend(TT.getOS() == Triple::AMDHSA ||
+                            TT.getOS() == Triple::AMDPAL) {
     switch (TT.getOS()) {
     case Triple::AMDHSA:
       OSABI = ELF::ELFOSABI_AMDGPU_HSA;
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 205a45a045a42..469a6525b4ac0 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -130,6 +130,9 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
         if (VirtReg.isPhysical())
           continue;
 
+        if (MI.isDebugInstr() && VirtReg == AMDGPU::NoRegister)
+          continue;
+
         if (!VRM->hasPhys(VirtReg))
           continue;
 
diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll
new file mode 100644
index 0000000000000..1d031979309a6
--- /dev/null
+++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll
@@ -0,0 +1,103 @@
+; RUN: llc %s -o %t.o -mcpu=gfx1030 -filetype=obj -O0
+; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s
+
+; This test compiles this module with AMDGPU backend under -O0,
+; and makes sure llvm-debuginfo-analzyer works for it.
+
+; Simple checks to make sure llvm-debuginfo-analzyer didn't fail early.
+; CHECK: Logical View:
+; CHECK: {CompileUnit}
+; CHECK: {Code} 's_endpgm'
+
+source_filename = "module"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-p10:32:32-p11:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p32:32:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32"
+target triple = "amdgcn-amd-amdpal"
+
+%dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+
+; Function Attrs: memory(readwrite)
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !14 {
+  %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i64 0, !dbg !28
+  %WorkgroupId.i0 = extractelement <3 x i32> %WorkgroupId, i64 0, !dbg !28
+  %1 = call i64 @llvm.amdgcn.s.getpc(), !dbg !28
+  %2 = shl i32 %WorkgroupId.i0, 6, !dbg !28
+  %3 = add i32 %LocalInvocationId.i0, %2, !dbg !28
+    #dbg_value(i32 %3, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !28)
+  %4 = and i64 %1, -4294967296, !dbg !30
+  %5 = zext i32 %userdata4 to i64, !dbg !30
+  %6 = or disjoint i64 %4, %5, !dbg !30
+  %7 = inttoptr i64 %6 to ptr addrspace(4), !dbg !30
+  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %7, i32 4), "dereferenceable"(ptr addrspace(4) %7, i32 -1) ], !dbg !30
+  %8 = load <4 x i32>, ptr addrspace(4) %7, align 4, !dbg !30, !invariant.load !2
+  %9 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %8, i32 %3, i32 0, i32 0, i32 0), !dbg !30
+    #dbg_value(%dx.types.ResRet.f32 poison, !31, !DIExpression(), !32)
+  %10 = fmul reassoc arcp contract afn float %9, 2.000000e+00, !dbg !33
+    #dbg_value(float %10, !34, !DIExpression(), !35)
+  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %7, i32 4), "dereferenceable"(ptr addrspace(4) %7, i32 -1) ], !dbg !36
+  %11 = getelementptr i8, ptr addrspace(4) %7, i64 32, !dbg !36
+  %.upto01 = insertelement <4 x float> poison, float %10, i64 0, !dbg !36
+  %12 = shufflevector <4 x float> %.upto01, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !36
+  %13 = load <4 x i32>, ptr addrspace(4) %11, align 4, !dbg !36, !invariant.load !2
+  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %12, <4 x i32> %13, i32 %3, i32 0, i32 0, i32 0), !dbg !36
+  ret void, !dbg !37
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i64 @llvm.amdgcn.s.getpc() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+declare void @llvm.assume(i1 noundef) #2
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #3
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #4
+
+attributes #0 = { memory(readwrite) "amdgpu-flat-work-group-size"="64,64" "amdgpu-memory-bound"="false" "amdgpu-num-sgpr"="4294967295" "amdgpu-num-vgpr"="4294967295" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="1200" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }
+attributes #4 = { nocallback nofree nosync nounwind willreturn memory(read) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!12, !13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "dxcoob 1.7.2308.16 (52da17e29)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3)
+!1 = !DIFile(filename: "tests\\basic_var.hlsl", directory: "")
+!2 = !{}
+!3 = !{!4, !10}
+!4 = distinct !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+!5 = !DIGlobalVariable(name: "u0", linkageName: "\01?u0@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true)
+!6 = !DICompositeType(tag: DW_TAG_class_type, name: "RWBuffer<float>", file: !1, line: 2, size: 32, align: 32, elements: !2, templateParams: !7)
+!7 = !{!8}
+!8 = !DITemplateTypeParameter(name: "element", type: !9)
+!9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+!10 = distinct !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+!11 = !DIGlobalVariable(name: "u1", linkageName: "\01?u1@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 3, type: !6, isLocal: false, isDefinition: true)
+!12 = !{i32 2, !"Dwarf Version", i32 5}
+!13 = !{i32 2, !"Debug Info Version", i32 3}
+!14 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !15, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null, !17}
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint3", file: !1, baseType: !18)
+!18 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<unsigned int, 3>", file: !1, size: 96, align: 32, elements: !19, templateParams: !24)
+!19 = !{!20, !22, !23}
+!20 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !18, file: !1, baseType: !21, size: 32, align: 32, flags: DIFlagPublic)
+!21 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+!22 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+!24 = !{!25, !26}
+!25 = !DITemplateTypeParameter(name: "element", type: !21)
+!26 = !DITemplateValueParameter(name: "element_count", type: !27, value: i32 3)
+!27 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!28 = !DILocation(line: 7, column: 17, scope: !14)
+!29 = !DILocalVariable(name: "dtid", arg: 1, scope: !14, file: !1, line: 7, type: !17)
+!30 = !DILocation(line: 11, column: 18, scope: !14)
+!31 = !DILocalVariable(name: "my_var", scope: !14, file: !1, line: 11, type: !9)
+!32 = !DILocation(line: 11, column: 9, scope: !14)
+!33 = !DILocation(line: 14, column: 26, scope: !14)
+!34 = !DILocalVariable(name: "my_var2", scope: !14, file: !1, line: 14, type: !9)
+!35 = !DILocation(line: 14, column: 9, scope: !14)
+!36 = !DILocation(line: 17, column: 14, scope: !14)
+!37 = !DILocation(line: 19, column: 1, scope: !14)
\ No newline at end of file

>From c6bacae4803be21e4204a202d4c2b1c4a5559bb6 Mon Sep 17 00:00:00 2001
From: Adam Yang <31109344+adam-yang at users.noreply.github.com>
Date: Fri, 20 Jun 2025 18:21:29 -0700
Subject: [PATCH 2/4] Moved the test to amdgpu target tests

---
 .../AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll}                     | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/{tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll => CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll} (100%)

diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
similarity index 100%
rename from llvm/test/tools/llvm-debuginfo-analyzer/DWARF/amdgpu.ll
rename to llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll

>From a4277f436bbfdbabeac50aa84c8c8d7f2c97b116 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Fri, 25 Jul 2025 12:00:53 -0700
Subject: [PATCH 3/4] Addressed feedback

---
 .../LogicalView/Readers/LVBinaryReader.h      |   2 +-
 .../LogicalView/Readers/LVBinaryReader.cpp    |   4 +-
 .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp  |   3 +-
 .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp    |   4 +-
 .../AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll  |  12 +-
 .../si-pre-allocate-wwwmregs-dbg-noreg.mir    | 210 ++++++++++++++++++
 6 files changed, 222 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir

diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
index 1f8b884bc1b5d..2cf4a8ec6a37f 100644
--- a/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
+++ b/llvm/include/llvm/DebugInfo/LogicalView/Readers/LVBinaryReader.h
@@ -160,7 +160,7 @@ class LVBinaryReader : public LVReader {
 
   // Loads all info for the architecture of the provided object file.
   Error loadGenericTargetInfo(StringRef TheTriple, StringRef TheFeatures,
-                              StringRef CPU);
+                              StringRef TheCPU);
 
   virtual void mapRangeAddress(const object::ObjectFile &Obj) {}
   virtual void mapRangeAddress(const object::ObjectFile &Obj,
diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
index 414f0f3efc82d..0df9137a3bd37 100644
--- a/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
+++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVBinaryReader.cpp
@@ -276,7 +276,7 @@ void LVBinaryReader::mapVirtualAddress(const object::COFFObjectFile &COFFObj) {
 
 Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
                                             StringRef TheFeatures,
-                                            StringRef CPU) {
+                                            StringRef TheCPU) {
   std::string TargetLookupError;
   const Target *TheTarget =
       TargetRegistry::lookupTarget(TheTriple, TargetLookupError);
@@ -300,7 +300,7 @@ Error LVBinaryReader::loadGenericTargetInfo(StringRef TheTriple,
 
   // Target subtargets.
   MCSubtargetInfo *SubtargetInfo(
-      TheTarget->createMCSubtargetInfo(TheTriple, CPU, TheFeatures));
+      TheTarget->createMCSubtargetInfo(TheTriple, TheCPU, TheFeatures));
   if (!SubtargetInfo)
     return createStringError(errc::invalid_argument,
                              "no subtarget info for target " + TheTriple);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index bf390e836078e..8f89168754180 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -226,8 +226,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
 public:
   ELFAMDGPUAsmBackend(const Target &T, const Triple &TT)
       : AMDGPUAsmBackend(T), Is64Bit(TT.isAMDGCN()),
-        HasRelocationAddend(TT.getOS() == Triple::AMDHSA ||
-                            TT.getOS() == Triple::AMDPAL) {
+        HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
     switch (TT.getOS()) {
     case Triple::AMDHSA:
       OSABI = ELF::ELFOSABI_AMDGPU_HSA;
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 469a6525b4ac0..f807c567efa2f 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -130,8 +130,10 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
         if (VirtReg.isPhysical())
           continue;
 
-        if (MI.isDebugInstr() && VirtReg == AMDGPU::NoRegister)
+        if (!VirtReg.isValid()) {
+          assert(MI.isDebugInstr() && "non-debug use of noreg");
           continue;
+        }
 
         if (!VRM->hasPhys(VirtReg))
           continue;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
index 1d031979309a6..2cff21c66172d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
@@ -2,11 +2,14 @@
 ; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s
 
 ; This test compiles this module with AMDGPU backend under -O0,
-; and makes sure llvm-debuginfo-analzyer works for it.
+; and makes sure llvm-debuginfo-analyzer works for it.
 
 ; Simple checks to make sure llvm-debuginfo-analzyer didn't fail early.
 ; CHECK: Logical View:
 ; CHECK: {CompileUnit}
+; CHECK-DAG: {Parameter} 'dtid' -> [0x{{[a-f0-9]+}}]'uint3'
+; CHECK-DAG: {Variable} 'my_var2' -> [0x{{[a-f0-9]+}}]'float'
+; CHECK-DAG: {Line} {{.+}}basic_var.hlsl
 ; CHECK: {Code} 's_endpgm'
 
 source_filename = "module"
@@ -15,7 +18,6 @@ target triple = "amdgcn-amd-amdpal"
 
 %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
 
-; Function Attrs: memory(readwrite)
 define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !14 {
   %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i64 0, !dbg !28
   %WorkgroupId.i0 = extractelement <3 x i32> %WorkgroupId, i64 0, !dbg !28
@@ -42,16 +44,12 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable,
   ret void, !dbg !37
 }
 
-; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare noundef i64 @llvm.amdgcn.s.getpc() #1
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
 declare void @llvm.assume(i1 noundef) #2
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
 declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #3
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #4
 
 attributes #0 = { memory(readwrite) "amdgpu-flat-work-group-size"="64,64" "amdgpu-memory-bound"="false" "amdgpu-num-sgpr"="4294967295" "amdgpu-num-vgpr"="4294967295" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="1200" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" }
@@ -100,4 +98,4 @@ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(read) }
 !34 = !DILocalVariable(name: "my_var2", scope: !14, file: !1, line: 14, type: !9)
 !35 = !DILocation(line: 14, column: 9, scope: !14)
 !36 = !DILocation(line: 17, column: 14, scope: !14)
-!37 = !DILocation(line: 19, column: 1, scope: !14)
\ No newline at end of file
+!37 = !DILocation(line: 19, column: 1, scope: !14)
diff --git a/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir b/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir
new file mode 100644
index 0000000000000..4b5fea863289b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-pre-allocate-wwwmregs-dbg-noreg.mir
@@ -0,0 +1,210 @@
+# RUN: llc %s -o - -mcpu=gfx1030 -O0 -run-pass=si-pre-allocate-wwm-regs | FileCheck %s
+
+# Simple regression test to make sure DBG_VALUE $noreg does not assert in the pass
+
+# CHECK: S_ENDPGM
+
+--- |
+  source_filename = "module"
+  target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+  target triple = "amdgcn-amd-amdpal"
+
+  %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
+
+  define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !14 {
+    %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i64 0, !dbg !28
+    %WorkgroupId.i0 = extractelement <3 x i32> %WorkgroupId, i64 0, !dbg !28
+    %1 = call i64 @llvm.amdgcn.s.getpc(), !dbg !28
+    %2 = shl i32 %WorkgroupId.i0, 6, !dbg !28
+    %3 = add i32 %LocalInvocationId.i0, %2, !dbg !28
+      #dbg_value(i32 %3, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !28)
+    %4 = and i64 %1, -4294967296, !dbg !30
+    %5 = zext i32 %userdata4 to i64, !dbg !30
+    %6 = or disjoint i64 %4, %5, !dbg !30
+    %7 = inttoptr i64 %6 to ptr addrspace(4), !dbg !30, !amdgpu.uniform !2
+    %8 = load <4 x i32>, ptr addrspace(4) %7, align 4, !dbg !30, !invariant.load !2
+    %9 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %8, i32 %3, i32 0, i32 0, i32 0), !dbg !30
+      #dbg_value(%dx.types.ResRet.f32 poison, !31, !DIExpression(), !32)
+    %10 = fmul reassoc arcp contract afn float %9, 2.000000e+00, !dbg !33
+      #dbg_value(float %10, !34, !DIExpression(), !35)
+    %11 = getelementptr i8, ptr addrspace(4) %7, i64 32, !dbg !36, !amdgpu.uniform !2
+    %.upto01 = insertelement <4 x float> poison, float %10, i64 0, !dbg !36
+    %12 = shufflevector <4 x float> %.upto01, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !36
+    %13 = load <4 x i32>, ptr addrspace(4) %11, align 4, !dbg !36, !invariant.load !2
+    call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %12, <4 x i32> %13, i32 %3, i32 0, i32 0, i32 0), !dbg !36
+    ret void, !dbg !37
+  }
+
+  declare noundef i64 @llvm.amdgcn.s.getpc() #1
+  declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #3
+  declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #4
+
+  attributes #0 = { memory(readwrite) "amdgpu-flat-work-group-size"="64,64" "amdgpu-memory-bound"="false" "amdgpu-num-sgpr"="4294967295" "amdgpu-num-vgpr"="4294967295" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="1200" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "target-cpu"="gfx1030" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" }
+  attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx1030" }
+  attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx1030" }
+  attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) "target-cpu"="gfx1030" }
+  attributes #4 = { nocallback nofree nosync nounwind willreturn memory(read) "target-cpu"="gfx1030" }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!12, !13}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "dxcoob 1.7.2308.16 (52da17e29)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3)
+  !1 = !DIFile(filename: "tests\\basic_var.hlsl", directory: "")
+  !2 = !{}
+  !3 = !{!4, !10}
+  !4 = distinct !DIGlobalVariableExpression(var: !5, expr: !DIExpression())
+  !5 = !DIGlobalVariable(name: "u0", linkageName: "\01?u0@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 2, type: !6, isLocal: false, isDefinition: true)
+  !6 = !DICompositeType(tag: DW_TAG_class_type, name: "RWBuffer<float>", file: !1, line: 2, size: 32, align: 32, elements: !2, templateParams: !7)
+  !7 = !{!8}
+  !8 = !DITemplateTypeParameter(name: "element", type: !9)
+  !9 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
+  !10 = distinct !DIGlobalVariableExpression(var: !11, expr: !DIExpression())
+  !11 = !DIGlobalVariable(name: "u1", linkageName: "\01?u1@@3V?$RWBuffer@M@@A", scope: !0, file: !1, line: 3, type: !6, isLocal: false, isDefinition: true)
+  !12 = !{i32 2, !"Dwarf Version", i32 5}
+  !13 = !{i32 2, !"Debug Info Version", i32 3}
+  !14 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !15, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
+  !15 = !DISubroutineType(types: !16)
+  !16 = !{null, !17}
+  !17 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint3", file: !1, baseType: !18)
+  !18 = !DICompositeType(tag: DW_TAG_class_type, name: "vector<unsigned int, 3>", file: !1, size: 96, align: 32, elements: !19, templateParams: !24)
+  !19 = !{!20, !22, !23}
+  !20 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !18, file: !1, baseType: !21, size: 32, align: 32, flags: DIFlagPublic)
+  !21 = !DIBasicType(name: "unsigned int", size: 32, align: 32, encoding: DW_ATE_unsigned)
+  !22 = !DIDerivedType(tag: DW_TAG_member, name: "y", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 32, flags: DIFlagPublic)
+  !23 = !DIDerivedType(tag: DW_TAG_member, name: "z", scope: !18, file: !1, baseType: !21, size: 32, align: 32, offset: 64, flags: DIFlagPublic)
+  !24 = !{!25, !26}
+  !25 = !DITemplateTypeParameter(name: "element", type: !21)
+  !26 = !DITemplateValueParameter(name: "element_count", type: !27, value: i32 3)
+  !27 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !28 = !DILocation(line: 7, column: 17, scope: !14)
+  !29 = !DILocalVariable(name: "dtid", arg: 1, scope: !14, file: !1, line: 7, type: !17)
+  !30 = !DILocation(line: 11, column: 18, scope: !14)
+  !31 = !DILocalVariable(name: "my_var", scope: !14, file: !1, line: 11, type: !9)
+  !32 = !DILocation(line: 11, column: 9, scope: !14)
+  !33 = !DILocation(line: 14, column: 26, scope: !14)
+  !34 = !DILocalVariable(name: "my_var2", scope: !14, file: !1, line: 14, type: !9)
+  !35 = !DILocation(line: 14, column: 9, scope: !14)
+  !36 = !DILocation(line: 17, column: 14, scope: !14)
+  !37 = !DILocation(line: 19, column: 1, scope: !14)
+...
+---
+name:            _amdgpu_cs_main
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          true
+isSSA:           false
+noVRegs:         false
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: false
+fixedStack:      []
+stack:           []
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  explicitKernArgSize: 0
+  maxKernArgAlign: 4
+  ldsSize:         0
+  gdsSize:         0
+  dynLDSAlign:     1
+  isEntryFunction: true
+  isChainFunction: false
+  noSignedZerosFPMath: false
+  memoryBound:     false
+  waveLimiter:     false
+  hasSpilledSGPRs: true
+  hasSpilledVGPRs: false
+  scratchRSrcReg:  '$private_rsrc_reg'
+  frameOffsetReg:  '$fp_reg'
+  stackPtrOffsetReg: '$sgpr32'
+  bytesInStackArgArea: 0
+  returnsVoid:     true
+  argumentInfo:
+    privateSegmentWaveByteOffset: { reg: '$sgpr6' }
+  psInputAddr:     0
+  psInputEnable:   0
+  maxMemoryClusterDWords: 8
+  mode:
+    ieee:            false
+    dx10-clamp:      true
+    fp32-input-denormals: false
+    fp32-output-denormals: false
+    fp64-fp16-input-denormals: true
+    fp64-fp16-output-denormals: true
+  highBitsOf32BitAddress: 0
+  occupancy:       16
+  vgprForAGPRCopy: ''
+  sgprForEXECCopy: '$sgpr12_sgpr13'
+  longBranchReservedReg: ''
+  hasInitWholeWave: false
+  dynamicVGPRBlockSize: 0
+  scratchReservedForDynamicVGPRs: 0
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+    %8:vgpr_32 = COPY killed $vgpr2
+    %7:vgpr_32 = COPY killed $vgpr1
+    %6:vgpr_32 = COPY killed $vgpr0
+    renamable $sgpr0 = COPY killed $sgpr4
+    %39:vgpr_32 = IMPLICIT_DEF
+    %39:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr3, 0, %39
+    renamable $sgpr3 = COPY killed $sgpr2
+    renamable $sgpr2 = COPY $sgpr1
+    $sgpr1 = SI_RESTORE_S32_FROM_VGPR %39, 0
+    dead renamable $sgpr4 = IMPLICIT_DEF
+    dead renamable $sgpr4 = IMPLICIT_DEF
+    dead renamable $sgpr4 = IMPLICIT_DEF
+    undef %38.sub0:vreg_96 = COPY %6
+    %38.sub1:vreg_96 = COPY %7
+    dead %38.sub2:vreg_96 = COPY %8
+    undef renamable $sgpr4 = COPY renamable $sgpr3, implicit-def $sgpr4_sgpr5_sgpr6
+    renamable $sgpr5 = COPY killed renamable $sgpr1
+    renamable $sgpr6 = COPY killed renamable $sgpr0
+    dead renamable $sgpr8_sgpr9_sgpr10 = IMPLICIT_DEF
+    renamable $sgpr0_sgpr1 = S_GETPC_B64_pseudo debug-location !28
+    renamable $sgpr4 = S_MOV_B32 6
+    %16:vgpr_32 = V_LSHL_ADD_U32_e64 killed $sgpr3, killed $sgpr4, %6, implicit $exec,  debug-location !28
+    DBG_VALUE %16, $noreg, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32),  debug-location !28
+    renamable $sgpr3 = S_MOV_B32 -1
+    renamable $sgpr4 = S_MOV_B32 0
+    undef renamable $sgpr6 = COPY renamable $sgpr4, implicit-def $sgpr6_sgpr7
+    renamable $sgpr7 = COPY killed renamable $sgpr3
+    renamable $sgpr0_sgpr1 = S_AND_B64 killed renamable $sgpr0_sgpr1, killed renamable $sgpr6_sgpr7, implicit-def dead $scc,  debug-location !30
+    renamable $sgpr5 = S_MOV_B32 0,  debug-location !30
+    undef renamable $sgpr2 = COPY killed renamable $sgpr2, implicit-def $sgpr2_sgpr3,  debug-location !30
+    renamable $sgpr3 = COPY killed renamable $sgpr5,  debug-location !30
+    renamable $sgpr0_sgpr1 = disjoint S_OR_B64 killed renamable $sgpr0_sgpr1, killed renamable $sgpr2_sgpr3, implicit-def dead $scc,  debug-location !30
+    renamable $sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0,  debug-location !30 :: (invariant load (s128) from %ir.7, align 4, addrspace 4)
+    renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr0_sgpr1, 32, 0,  debug-location !36 :: (invariant load (s128) from %ir.11, align 4, addrspace 4)
+    %26:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %16, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, renamable $sgpr4, 0, 0, 0, implicit $exec,  debug-location !30 :: (dereferenceable load (s32), align 1, addrspace 8)
+    DBG_VALUE $noreg, $noreg, !31, !DIExpression(),  debug-location !32
+    %27:vgpr_32 = arcp contract afn reassoc nofpexcept V_ADD_F32_e64 0, %26, 0, %26, 0, 0, implicit $mode, implicit $exec,  debug-location !33
+    DBG_VALUE %27, $noreg, !34, !DIExpression(),  debug-location !35
+    dead renamable $sgpr5 = IMPLICIT_DEF debug-location !36
+    dead renamable $sgpr5 = IMPLICIT_DEF debug-location !36
+    dead renamable $sgpr5 = IMPLICIT_DEF debug-location !36
+    dead renamable $sgpr5 = IMPLICIT_DEF debug-location !36
+    undef %37.sub0:vreg_128 = COPY %27,  debug-location !36
+    %37.sub1:vreg_128 = COPY %27,  debug-location !36
+    %37.sub2:vreg_128 = COPY %27,  debug-location !36
+    %37.sub3:vreg_128 = COPY %27,  debug-location !36
+    %29:vreg_128 = COPY %37,  debug-location !36
+    BUFFER_STORE_FORMAT_XYZW_IDXEN_exact %29, %16, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec,  debug-location !36 :: (dereferenceable store (s128), align 1, addrspace 8)
+    S_ENDPGM 0,  debug-location !37
+...

>From c938125a081f5335948c39a915555261eabdeda5 Mon Sep 17 00:00:00 2001
From: Adam Yang <hanbyang at microsoft.com>
Date: Mon, 28 Jul 2025 14:50:18 -0700
Subject: [PATCH 4/4] Addressed feedback

---
 .../AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll  | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
index 2cff21c66172d..89fc6c062c29d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll
@@ -13,7 +13,6 @@
 ; CHECK: {Code} 's_endpgm'
 
 source_filename = "module"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-p10:32:32-p11:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9-p32:32:32-v8:8-v16:16-v32:32-v48:32-v64:32-v80:32-v96:32-v112:32-v128:32-v144:32-v160:32-v176:32-v192:32-v208:32-v224:32-v240:32-v256:32-i1:32-i8:8-i16:16-i32:32-i64:32-f16:16-f32:32-f64:32"
 target triple = "amdgcn-amd-amdpal"
 
 %dx.types.ResRet.f32 = type { float, float, float, float, i32 }
@@ -21,26 +20,26 @@ target triple = "amdgcn-amd-amdpal"
 define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg noundef %globalTable, i32 inreg noundef %userdata4, <3 x i32> inreg noundef %WorkgroupId, i32 inreg noundef %MultiDispatchInfo, <3 x i32> noundef %LocalInvocationId) #0 !dbg !14 {
   %LocalInvocationId.i0 = extractelement <3 x i32> %LocalInvocationId, i64 0, !dbg !28
   %WorkgroupId.i0 = extractelement <3 x i32> %WorkgroupId, i64 0, !dbg !28
-  %1 = call i64 @llvm.amdgcn.s.getpc(), !dbg !28
-  %2 = shl i32 %WorkgroupId.i0, 6, !dbg !28
-  %3 = add i32 %LocalInvocationId.i0, %2, !dbg !28
-    #dbg_value(i32 %3, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !28)
-  %4 = and i64 %1, -4294967296, !dbg !30
-  %5 = zext i32 %userdata4 to i64, !dbg !30
-  %6 = or disjoint i64 %4, %5, !dbg !30
-  %7 = inttoptr i64 %6 to ptr addrspace(4), !dbg !30
-  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %7, i32 4), "dereferenceable"(ptr addrspace(4) %7, i32 -1) ], !dbg !30
-  %8 = load <4 x i32>, ptr addrspace(4) %7, align 4, !dbg !30, !invariant.load !2
-  %9 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %8, i32 %3, i32 0, i32 0, i32 0), !dbg !30
+  %pc = call i64 @llvm.amdgcn.s.getpc(), !dbg !28
+  %offset = shl i32 %WorkgroupId.i0, 6, !dbg !28
+  %dtid = add i32 %LocalInvocationId.i0, %offset, !dbg !28
+    #dbg_value(i32 %dtid, !29, !DIExpression(DW_OP_LLVM_fragment, 0, 32), !28)
+  %pc_hi = and i64 %pc, -4294967296, !dbg !30
+  %zext = zext i32 %userdata4 to i64, !dbg !30
+  %ptr_val = or disjoint i64 %pc_hi, %zext, !dbg !30
+  %ptr = inttoptr i64 %ptr_val to ptr addrspace(4), !dbg !30
+  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %ptr, i32 4), "dereferenceable"(ptr addrspace(4) %ptr, i32 -1) ], !dbg !30
+  %uav_0 = load <4 x i32>, ptr addrspace(4) %ptr, align 4, !dbg !30, !invariant.load !2
+  %uav_load_1 = call float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32> %uav_0, i32 %dtid, i32 0, i32 0, i32 0), !dbg !30
     #dbg_value(%dx.types.ResRet.f32 poison, !31, !DIExpression(), !32)
-  %10 = fmul reassoc arcp contract afn float %9, 2.000000e+00, !dbg !33
-    #dbg_value(float %10, !34, !DIExpression(), !35)
-  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %7, i32 4), "dereferenceable"(ptr addrspace(4) %7, i32 -1) ], !dbg !36
-  %11 = getelementptr i8, ptr addrspace(4) %7, i64 32, !dbg !36
-  %.upto01 = insertelement <4 x float> poison, float %10, i64 0, !dbg !36
-  %12 = shufflevector <4 x float> %.upto01, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !36
-  %13 = load <4 x i32>, ptr addrspace(4) %11, align 4, !dbg !36, !invariant.load !2
-  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %12, <4 x i32> %13, i32 %3, i32 0, i32 0, i32 0), !dbg !36
+  %mul = fmul reassoc arcp contract afn float %uav_load_1, 2.000000e+00, !dbg !33
+    #dbg_value(float %mul, !34, !DIExpression(), !35)
+  call void @llvm.assume(i1 true) [ "align"(ptr addrspace(4) %ptr, i32 4), "dereferenceable"(ptr addrspace(4) %ptr, i32 -1) ], !dbg !36
+  %uav_1_ptr = getelementptr i8, ptr addrspace(4) %ptr, i64 32, !dbg !36
+  %.upto01 = insertelement <4 x float> poison, float %mul, i64 0, !dbg !36
+  %filled_vector = shufflevector <4 x float> %.upto01, <4 x float> poison, <4 x i32> zeroinitializer, !dbg !36
+  %uav_1 = load <4 x i32>, ptr addrspace(4) %uav_1_ptr, align 4, !dbg !36, !invariant.load !2
+  call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> %filled_vector, <4 x i32> %uav_1, i32 %dtid, i32 0, i32 0, i32 0), !dbg !36
   ret void, !dbg !37
 }
 
@@ -52,7 +51,7 @@ declare void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float>, <4 x i32
 
 declare float @llvm.amdgcn.struct.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32 immarg) #4
 
-attributes #0 = { memory(readwrite) "amdgpu-flat-work-group-size"="64,64" "amdgpu-memory-bound"="false" "amdgpu-num-sgpr"="4294967295" "amdgpu-num-vgpr"="4294967295" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="1200" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="3" "denormal-fp-math"="ieee" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,+cumode,+enable-flat-scratch" }
+attributes #0 = { memory(readwrite) }
 attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
 attributes #3 = { nocallback nofree nosync nounwind willreturn memory(write) }



More information about the llvm-commits mailing list