[llvm] Try To Guess SGMasks for Inline Asm Instructions (PR #155491)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 15:50:17 PDT 2025
https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/155491
>From e610f9a25bf32918922f8d315d9ca2b6b7530606 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 22 Aug 2025 00:14:03 -0500
Subject: [PATCH 01/12] Attempt to add inline asm to sched group barriers
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 5700468e2420e..9d564f82dfc3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
+ INLINE_ASM = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS,
+ DS_READ | DS_WRITE | TRANS | INLINE_ASM,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2440,6 +2441,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
+ else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
+ MI.isInlineAsm())
+ Result = true;
+
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
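Patch 01 takes the direct route: a dedicated INLINE_ASM bit (1u << 11, i.e. 0x800) in SchedGroupMask, so any inline-asm MachineInstr can be claimed by a sched_group_barrier carrying that bit. A minimal HIP-flavored sketch of the intended use, assuming the builtin's mask argument maps one-to-one onto SchedGroupMask the way the existing bits do (function and variable names are illustrative):

    __device__ void pin_asm(float a, float b, float *out) {
      float r;
      asm volatile("v_add_f32_e32 %0, %1, %2" : "=v"(r) : "v"(a), "v"(b));
      // Reserve one slot here for an inline-asm instruction; 0x800 is the
      // new INLINE_ASM bit.
      __builtin_amdgcn_sched_group_barrier(0x800, 1, 0);
      *out = r;
    }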
>From 14e3f8e9527776881c018811720be89cdd166f00 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 26 Aug 2025 15:23:02 -0500
Subject: [PATCH 02/12] Allow specifying sched group barrier masks for inline
asm
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 9d564f82dfc3c..21ef960412507 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,9 +75,8 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
- INLINE_ASM = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS | INLINE_ASM,
+ DS_READ | DS_WRITE | TRANS,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2392,6 +2391,16 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (MI.isMetaInstruction())
Result = false;
+ else if (MI.isInlineAsm()) {
+ std::string Text = MI.getOperand(0).getSymbolName();
+ if (Text.find("SGMASK:") != std::string::npos) {
+ Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
+ Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
+ unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0);
+ Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+ }
+ }
+
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
@@ -2441,10 +2450,6 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
- else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
- MI.isInlineAsm())
- Result = true;
-
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
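Patch 02 swaps the blanket bit for a per-instruction hint: canAddMI scans the asm text for the literal token "SGMASK:", parses the number that follows (std::stoul with base 0, so a 0x prefix works), and intersects that user-supplied mask with the group's own. A sketch of the annotation at the source level; the value 0x2 (SchedGroupMask::VALU) is chosen purely as an example:

    __device__ void tagged_add(float x, float y, float *out) {
      float r;
      // To the assembler "; SGMASK:0x2" is an ordinary trailing comment; to
      // AMDGPUIGroupLP it is a hint to classify this asm as VALU.
      asm volatile("v_add_f32_e32 %0, %1, %2 ; SGMASK:0x2"
                   : "=v"(r) : "v"(x), "v"(y));
      *out = r;
    }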
>From e3ed8200aab1de545b0b26fcc35de882296a167f Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 28 Aug 2025 11:17:43 -0500
Subject: [PATCH 03/12] Switch to StringRef
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 21ef960412507..0f4dd96e642ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,11 +2392,11 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (MI.isInlineAsm()) {
- std::string Text = MI.getOperand(0).getSymbolName();
+ StringRef Text = MI.getOperand(0).getSymbolName();
if (Text.find("SGMASK:") != std::string::npos) {
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
- unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0);
+ unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
}
}
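Patch 03 avoids copying the asm text into a std::string, though the mask itself is still parsed by round-tripping through Text.str() into std::stoul, which throws on malformed input. A throw-free alternative sketch using StringRef's own parser (not what the patch does; same variable names as the patch):

    unsigned long InlineAsmMask = 0;
    // getAsInteger returns true on failure, and radix 0 auto-detects a 0x
    // prefix, so a malformed SGMASK hint is ignored instead of throwing.
    if (!Text.getAsInteger(/*Radix=*/0, InlineAsmMask))
      Result = ((unsigned long)SGMask & InlineAsmMask) != 0;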
>From 6f1d602775972eb26fec41b3d37296962654411f Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 29 Aug 2025 17:35:22 -0500
Subject: [PATCH 04/12] Add testcase
---
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 71 ++++++++++++++++++++
1 file changed, 71 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
new file mode 100644
index 0000000000000..2f35640226fd6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -0,0 +1,71 @@
+; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
+; CHECK: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMEND
+; CHECK: v_add_f32_e32
+; ModuleID = '<stdin>'
+source_filename = "llvm-link"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
+ at __hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %0, 6
+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul3 = shl nsw i32 %add, 2
+ %idx.ext4 = sext i32 %mul3 to i64
+ %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
+ %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+ %3 = extractelement <4 x float> %2, i64 3
+ %4 = extractelement <4 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3
+ %6 = extractelement <4 x float> %2, i64 1
+ %7 = extractelement <4 x float> %2, i64 2
+ %add6 = fadd contract float %6, %7
+ %add7 = fadd contract float %5, %add6
+ store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2
+
+attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nofree nounwind willreturn }
+attributes #3 = { convergent nounwind memory(none) }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"omnipotent char", !2, i64 0}
+!2 = !{!"Simple C++ TBAA"}
+!3 = !{i64 129}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"float", !1, i64 0}
>From 444c33b933aacb98848684483fda728e17d7e506 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 1 Oct 2025 15:25:02 -0500
Subject: [PATCH 05/12] Guess the constraints instead of using user-provided
hints
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 43 ++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 17 ++++----
2 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 0f4dd96e642ba..2163f70bf0146 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,6 +2392,48 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (MI.isInlineAsm()) {
+ auto &TRI = TII->getRegisterInfo();
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
+ MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
+ for (const MachineOperand &Operand : MI.operands())
+ if (Operand.isReg()) {
+ auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
+ if (TRI.isVGPRClass(&RegClass))
+ VGPR_used = true;
+ if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+ VMFMA_used = true;
+ if (TRI.isSGPRClass(&RegClass))
+ SGPR_used = true;
+ }
+
+ unsigned long InlineAsmMask = 0;
+ if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
+ if (VMFMA_used)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
+ if (VGPR_used && MayLoad)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_READ;
+ if (VGPR_used && MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_WRITE;
+ if (!VGPR_used && MayLoad)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS_READ;
+ if (!VGPR_used && MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS_WRITE;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::VALU ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::SALU)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::ALU;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::DS_READ ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::DS_WRITE)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_READ ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_WRITE)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM;
+
+ Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+
+ // Original implementation
+#if 0
StringRef Text = MI.getOperand(0).getSymbolName();
if (Text.find("SGMASK:") != std::string::npos) {
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
@@ -2399,6 +2441,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
}
+#endif
}
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 2f35640226fd6..89bf6ad677545 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -28,12 +28,15 @@ if.then: ; preds = %entry
%mul3 = shl nsw i32 %add, 2
%idx.ext4 = sext i32 %mul3 to i64
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
- %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
- %3 = extractelement <4 x float> %2, i64 3
- %4 = extractelement <4 x float> %2, i64 0
- %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3
- %6 = extractelement <4 x float> %2, i64 1
- %7 = extractelement <4 x float> %2, i64 2
+ %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+ %a20 = add i64 %idx.ext4, 2
+ %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
+ %3 = extractelement <2 x float> %a22, i64 1
+ %4 = extractelement <2 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
+ %6 = extractelement <2 x float> %2, i64 1
+ %7 = extractelement <2 x float> %a22, i64 0
%add6 = fadd contract float %6, %7
%add7 = fadd contract float %5, %add6
store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
@@ -41,7 +44,7 @@ if.then: ; preds = %entry
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
- tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
br label %if.end
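Patch 05 drops the explicit hint in favor of guessing from the instruction itself: walk the register operands, record whether SGPRs, VGPRs, or MFMA-suggestive operands (AGPRs, or registers wider than 128 bits) appear, combine that with mayLoad()/mayStore(), and synthesize an InlineAsmMask. Tracing the updated test through this logic: the "=a,v,v" constraints give the asm an AGPR result, so VMFMA_used is set and the guess includes SchedGroupMask::MFMA, which is why the test's ALU barrier (i32 1) becomes an MFMA barrier (i32 8).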
>From cf45c2e79efcf72d814b0c5575b460eddc752789 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 7 Oct 2025 11:55:04 -0500
Subject: [PATCH 06/12] Fix error-that-should-be-warning, code may or may not
be correct
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 2163f70bf0146..fa5e6751c371c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2410,6 +2410,8 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
unsigned long InlineAsmMask = 0;
if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
+ if (SGPR_used && !MayLoad && !MayStore) //arsenm: should this have !VGPR_used?
+ InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
if (VMFMA_used)
InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
if (VGPR_used && MayLoad)
>From 4516f96f3d2a2b7af690842cef36fddd18d09e23 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 7 Oct 2025 12:52:43 -0500
Subject: [PATCH 07/12] Update testcase; update algorithm to something maybe
right
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 ++--
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fa5e6751c371c..db5c3e8292836 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2408,9 +2408,9 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
}
unsigned long InlineAsmMask = 0;
- if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ if (VGPR_used && !SGPR_used && !VMFMA_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
- if (SGPR_used && !MayLoad && !MayStore) //arsenm: should this have !VGPR_used?
+ if (SGPR_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
if (VMFMA_used)
InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 89bf6ad677545..402b1408284d5 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -1,7 +1,7 @@
; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
; CHECK: v_add_f32_e32
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_add_f32_e32
+; CHECK-NEXT: v_mfma_f64
; CHECK-NEXT: ;;#ASMEND
; CHECK: v_add_f32_e32
; ModuleID = '<stdin>'
>From 8789ad123bd45a54f6db9444a3b38f8d4a5423c7 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 14 Oct 2025 16:38:40 -0500
Subject: [PATCH 08/12] Review changes
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 20 +++++-------------
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 22 ++++++--------------
2 files changed, 11 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index db5c3e8292836..9768eb2c1b958 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,23 +2392,23 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (MI.isInlineAsm()) {
- auto &TRI = TII->getRegisterInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
auto &MRI = MI.getParent()->getParent()->getRegInfo();
bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
for (const MachineOperand &Operand : MI.operands())
if (Operand.isReg()) {
auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
- if (TRI.isVGPRClass(&RegClass))
+ if (TRI.hasVGPRs(&RegClass))
VGPR_used = true;
- if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128) // > 128 bit registers are usually only used by MFMA instructions, so we're using that as a heuristic to guess the schedule group mask of the inline asm.
VMFMA_used = true;
- if (TRI.isSGPRClass(&RegClass))
+ if (TRI.hasSGPRs(&RegClass))
SGPR_used = true;
}
unsigned long InlineAsmMask = 0;
- if (VGPR_used && !SGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
if (SGPR_used && !MayLoad && !MayStore)
InlineAsmMask |= (unsigned long)SchedGroupMask::SALU;
@@ -2434,16 +2434,6 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
- // Original implementation
-#if 0
- StringRef Text = MI.getOperand(0).getSymbolName();
- if (Text.find("SGMASK:") != std::string::npos) {
- Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
- Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
- unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
- Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
- }
-#endif
}
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 402b1408284d5..e9211d6651f31 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -1,16 +1,13 @@
-; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
; CHECK: v_add_f32_e32
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_mfma_f64
; CHECK-NEXT: ;;#ASMEND
; CHECK: v_add_f32_e32
; ModuleID = '<stdin>'
-source_filename = "llvm-link"
-target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
- at __hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0
; Function Attrs: convergent mustprogress norecurse nounwind
define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
@@ -28,18 +25,18 @@ if.then: ; preds = %entry
%mul3 = shl nsw i32 %add, 2
%idx.ext4 = sext i32 %mul3 to i64
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
- %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+ %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
%a20 = add i64 %idx.ext4, 2
%a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
- %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
%3 = extractelement <2 x float> %a22, i64 1
%4 = extractelement <2 x float> %2, i64 0
- %5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
+ %5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3
%6 = extractelement <2 x float> %2, i64 1
%7 = extractelement <2 x float> %a22, i64 0
%add6 = fadd contract float %6, %7
%add7 = fadd contract float %5, %add6
- store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
+ store float %add7, ptr addrspace(1) %add.ptr, align 4
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
@@ -64,11 +61,4 @@ declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg
attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nofree nounwind willreturn }
-attributes #3 = { convergent nounwind memory(none) }
-
-!0 = !{!1, !1, i64 0}
-!1 = !{!"omnipotent char", !2, i64 0}
-!2 = !{!"Simple C++ TBAA"}
-!3 = !{i64 129}
-!4 = !{!5, !5, i64 0}
-!5 = !{!"float", !1, i64 0}
+attributes #3 = { convergent nounwind memory(none) }
\ No newline at end of file
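The review round also swaps the exact class predicates (isVGPRClass and friends) for the has*GPRs variants, presumably so that operands constrained to mixed register classes still count toward each register kind before allocation pins them down, and it deletes the #if 0 remnant of the original SGMASK parser.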
>From 11936126fda569443c0178c87a35f31ac9134bf0 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 14 Oct 2025 16:44:25 -0500
Subject: [PATCH 09/12] Fix testcase
---
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index e9211d6651f31..64baf7a9ebdf9 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -7,8 +7,6 @@
; ModuleID = '<stdin>'
target triple = "amdgcn-amd-amdhsa"
- at llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
-
; Function Attrs: convergent mustprogress norecurse nounwind
define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
entry:
@@ -61,4 +59,4 @@ declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg
attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nofree nounwind willreturn }
-attributes #3 = { convergent nounwind memory(none) }
\ No newline at end of file
+attributes #3 = { convergent nounwind memory(none) }
>From 4a5f90e10d940e74fffa6b45ccabadf9ae6ba8f7 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 15 Oct 2025 11:24:34 -0500
Subject: [PATCH 10/12] Review changes
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 9768eb2c1b958..fcdd98d03ff68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2395,13 +2395,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
const SIRegisterInfo &TRI = TII->getRegisterInfo();
auto &MRI = MI.getParent()->getParent()->getRegInfo();
bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
- MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
+ VReg32_used = false, MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
for (const MachineOperand &Operand : MI.operands())
if (Operand.isReg()) {
auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
- if (TRI.hasVGPRs(&RegClass))
+ if (TRI.hasVGPRs(&RegClass)) {
VGPR_used = true;
- if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128) // > 128 bit registers are usually only used by MFMA instructions, so we're using that as a heuristic to guess the schedule group mask of the inline asm.
+ if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
+ VReg32_used = false;
+ }
+ // >= 128 bit registers are usually only used by MFMA instructions, so
+ // we're using that as a heuristic to guess the schedule group mask of
+ // the inline asm.
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) >= 128)
VMFMA_used = true;
if (TRI.hasSGPRs(&RegClass))
SGPR_used = true;
@@ -2415,13 +2421,12 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (VMFMA_used)
InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
if (VGPR_used && MayLoad)
- InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_READ;
+ InlineAsmMask |= (unsigned long)(VReg32_used ? SchedGroupMask::DS_READ
+ : SchedGroupMask::VMEM_READ);
if (VGPR_used && MayStore)
- InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_WRITE;
- if (!VGPR_used && MayLoad)
- InlineAsmMask |= (unsigned long)SchedGroupMask::DS_READ;
- if (!VGPR_used && MayStore)
- InlineAsmMask |= (unsigned long)SchedGroupMask::DS_WRITE;
+ InlineAsmMask |=
+ (unsigned long)(VReg32_used ? SchedGroupMask::DS_WRITE
+ : SchedGroupMask::VMEM_WRITE);
if (InlineAsmMask & (unsigned long)SchedGroupMask::VALU ||
InlineAsmMask & (unsigned long)SchedGroupMask::SALU)
InlineAsmMask |= (unsigned long)SchedGroupMask::ALU;
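Patch 10 refines the DS-versus-VMEM split: instead of keying on whether any VGPR appears at all, it checks whether a 32-bit VGPR is used as an input. The likely rationale, not stated in the patch: LDS is addressed through a single 32-bit VGPR while flat and global accesses take a 64-bit VGPR pair, so a 32-bit VGPR use on a memory-touching asm points at DS traffic. (As written the patch assigns VReg32_used = false, which the next commit flips to true as its advertised bugfix.) A minimal sketch of the pattern the heuristic targets, with illustrative names:

    __device__ void lds_load(int lds_addr, float *out) {
      float val;
      // A single 32-bit VGPR use plus mayLoad: the guess becomes DS_READ
      // rather than VMEM_READ.
      asm volatile("ds_read_b32 %0, %1" : "=v"(val) : "v"(lds_addr) : "memory");
      *out = val;
    }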
>From 68b749feaa3c41d8297da6296f87b5dde7378ee0 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 17 Oct 2025 17:06:01 -0500
Subject: [PATCH 11/12] Testcases and bugfix
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 81 +++++++++++++++++++-
2 files changed, 80 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fcdd98d03ff68..8ddc23456788d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2402,7 +2402,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (TRI.hasVGPRs(&RegClass)) {
VGPR_used = true;
if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
- VReg32_used = false;
+ VReg32_used = true;
}
// >= 128 bit registers are usually only used by MFMA instructions, so
// we're using that as a heuristic to guess the schedule group mask of
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 64baf7a9ebdf9..510aae509cdc1 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -1,4 +1,5 @@
; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
+; CHECK-LABEL: test_mfma
; CHECK: v_add_f32_e32
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: v_mfma_f64
@@ -8,7 +9,43 @@
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: convergent mustprogress norecurse nounwind
-define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+define protected amdgpu_kernel void @test_valu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %0, 6
+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul4 = shl nsw i32 %add, 2
+ %idx.ext5 = sext i32 %mul4 to i64
+ %add.ptr6 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext5
+ %2 = load <4 x float>, ptr addrspace(1) %add.ptr6, align 16
+ %3 = extractelement <4 x float> %2, i64 3
+ %4 = extractelement <4 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2", "=v,v,v"(float %3, float %4) #3
+ %6 = extractelement <4 x float> %2, i64 1
+ %7 = extractelement <4 x float> %2, i64 2
+ %add7 = fadd contract float %6, %7
+ %add8 = fadd contract float %5, %add7
+ store float %add8, ptr addrspace(1) %add.ptr, align 4
+ %mul9 = mul nsw i32 %k, 3
+ store i32 %mul9, ptr addrspace(1) %ret.coerce, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 6, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 4, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @test_mfma(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
entry:
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%mul = shl i32 %0, 6
@@ -47,6 +84,45 @@ if.end: ; preds = %if.then, %entry
ret void
}
+ at _ZZ16group4_sum_floatPfPKfE6cpymem = internal addrspace(3) global [8 x float] undef, align 16
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @test_ds(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %0, 6
+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul3 = shl nsw i32 %add, 2
+ %idx.ext4 = sext i32 %mul3 to i64
+ %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
+ %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
+ %a20 = add i64 %idx.ext4, 2
+ %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
+ %3 = extractelement <2 x float> %a22, i64 1
+ %4 = extractelement <2 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "ds_read_b32 $0, $1 offset:0", "=v,v,~{memory}"(i32 ptrtoint (ptr addrspacecast (ptr addrspace(3) @_ZZ16group4_sum_floatPfPKfE6cpymem to ptr) to i32)) #4
+ %6 = extractelement <2 x float> %2, i64 1
+ %7 = extractelement <2 x float> %a22, i64 0
+ %add6 = fadd contract float %6, %7
+ %add7 = fadd contract float %5, %add6
+ store float %add7, ptr addrspace(1) %add.ptr, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 7, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 128, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
@@ -56,7 +132,8 @@ declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
; Function Attrs: convergent nocallback nofree nounwind willreturn
declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2
-attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
+attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="1" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #2 = { convergent nocallback nofree nounwind willreturn }
attributes #3 = { convergent nounwind memory(none) }
+attributes #4 = { convergent nounwind }
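Besides the bugfix, patch 11 adds a DS test whose ds_read_b32 asm takes a single 32-bit address operand plus a memory clobber, exercising the new VReg32 path against a DS barrier (mask 128), and it flips "amdgpu-agpr-alloc" from "0" to "1", presumably because the MFMA test's "=a" constraint needs an AGPR to actually be allocatable.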
>From cc7b4b078d98bda55d8f992d3193935e1ffb52c2 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Mon, 20 Oct 2025 15:07:07 -0500
Subject: [PATCH 12/12] Finalize tests
---
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 60 ++++++++++++++++++--
1 file changed, 54 insertions(+), 6 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 510aae509cdc1..898e1bb466376 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -1,15 +1,12 @@
; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
-; CHECK-LABEL: test_mfma
-; CHECK: v_add_f32_e32
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: v_mfma_f64
-; CHECK-NEXT: ;;#ASMEND
-; CHECK: v_add_f32_e32
; ModuleID = '<stdin>'
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: convergent mustprogress norecurse nounwind
define protected amdgpu_kernel void @test_valu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_valu
+; CHECK: s_mul_i32
+; CHECK: ASMSTART
entry:
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%mul = shl i32 %0, 6
@@ -44,8 +41,55 @@ if.end: ; preds = %if.then, %entry
ret void
}
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @test_salu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_salu
+; CHECK: %bb.1
+; CHECK-NEXT: s_load
+; CHECK-NEXT: s_load
+; CHECK-NEXT: s_waitcnt
+; CHECK-NEXT: ASMSTART
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %0, 6
+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul4 = shl nsw i32 %add, 2
+ %idx.ext5 = sext i32 %mul4 to i64
+ %add.ptr6 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext5
+ %2 = load <4 x float>, ptr addrspace(1) %add.ptr6, align 16
+ %3 = extractelement <4 x float> %2, i64 3
+ %4 = extractelement <4 x float> %2, i64 0
+ %5 = fadd contract float %3, %4
+ %6 = extractelement <4 x float> %2, i64 1
+ %7 = extractelement <4 x float> %2, i64 2
+ %add7 = fadd contract float %6, %7
+ %add8 = fadd contract float %5, %add7
+ store float %add8, ptr addrspace(1) %add.ptr, align 4
+ %mul9 = tail call noundef i32 asm "s_mul_i32, $0, $1, 3", "=s,s"(i32 %k) #3
+ store i32 %mul9, ptr addrspace(1) %ret.coerce, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 10, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
; Function Attrs: convergent mustprogress norecurse nounwind
define protected amdgpu_kernel void @test_mfma(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mfma
+; CHECK: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_mfma_f64
+; CHECK-NEXT: ;;#ASMEND
+; CHECK: v_add_f32_e32
entry:
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%mul = shl i32 %0, 6
@@ -88,6 +132,10 @@ if.end: ; preds = %if.then, %entry
; Function Attrs: convergent mustprogress norecurse nounwind
define protected amdgpu_kernel void @test_ds(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_ds
+; CHECK-DAG: v_lshl_add_u64
+; CHECK-DAG: v_add_f32_e32
+; CHECK-NEXT: ASMSTART
entry:
%0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
%mul = shl i32 %0, 6