[llvm] Allow Specifying SGMasks for Inline Asm (PR #155491)
Patrick Simmons via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 1 13:25:43 PDT 2025
https://github.com/linuxrocks123 updated https://github.com/llvm/llvm-project/pull/155491
>From 7af6d4c9dacf48fa29a25238dd68febc4db38dbf Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 22 Aug 2025 00:14:03 -0500
Subject: [PATCH 1/5] Attempt to add inline asm to sched group barriers
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1b08f8c..9689773872a6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
+ INLINE_ASM = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS,
+ DS_READ | DS_WRITE | TRANS | INLINE_ASM,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2436,6 +2437,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
+ else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
+ MI.isInlineAsm())
+ Result = true;
+
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
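
[Editor's note] Patch 1 exposes inline asm to scheduling groups through a new INLINE_ASM mask bit (1u << 11, i.e. 0x800). A minimal usage sketch, assuming the clang builtin __builtin_amdgcn_sched_group_barrier forwards its mask argument unchanged to SchedGroupMask; the function and variable names below are hypothetical:

    // Hedged sketch of patch 1's interface: reserve one scheduling-group slot
    // for an inline-asm instruction via the assumed 0x800 (INLINE_ASM) bit.
    __device__ float asm_add(float a, float b) {
      float r;
      asm volatile("v_add_f32_e32 %0, %1, %2" : "=v"(r) : "v"(a), "v"(b));
      __builtin_amdgcn_sched_group_barrier(/*INLINE_ASM*/ 0x800, /*size*/ 1, /*syncID*/ 0);
      return r;
    }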
>From 7b33a532f16138df3a1ed5f9bbd6180c4f6fcdf3 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Tue, 26 Aug 2025 15:23:02 -0500
Subject: [PATCH 2/5] Allow specifying sched group barrier masks for inline asm
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 9689773872a6e..8c514714bd7dd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,9 +75,8 @@ enum class SchedGroupMask {
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
TRANS = 1u << 10,
- INLINE_ASM = 1u << 11,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE | TRANS | INLINE_ASM,
+ DS_READ | DS_WRITE | TRANS,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -2392,6 +2391,16 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (MI.isMetaInstruction())
Result = false;
+ else if (MI.isInlineAsm()) {
+ std::string Text = MI.getOperand(0).getSymbolName();
+ if (Text.find("SGMASK:") != std::string::npos) {
+ Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
+ Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
+ unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0);
+ Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+ }
+ }
+
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
@@ -2437,10 +2446,6 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
TII->isTRANS(MI))
Result = true;
- else if (((SGMask & SchedGroupMask::INLINE_ASM) != SchedGroupMask::NONE) &&
- MI.isInlineAsm())
- Result = true;
-
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
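
[Editor's note] Patch 2 replaces the dedicated mask bit with an in-text annotation: canAddMI() looks for "SGMASK:<value>" inside the asm string and intersects that value with the group's SGMask. A minimal source-level sketch of how a user would tag a statement; the 0x2 value assumes VALU is bit 1 of SchedGroupMask, and the names are hypothetical:

    // Hedged sketch of patch 2's interface: the SGMASK: tag in the asm text
    // tells the scheduler which groups may claim this statement.
    __device__ float tagged_add(float a, float b) {
      float r;
      asm("v_add_f32_e32 %0, %1, %2 ; SGMASK:0x2" : "=v"(r) : "v"(a), "v"(b));
      __builtin_amdgcn_sched_group_barrier(/*VALU*/ 0x2, /*size*/ 1, /*syncID*/ 0);
      return r;
    }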
>From 238b3facebfe2573d73f9d426d2313e8f38c556a Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Thu, 28 Aug 2025 11:17:43 -0500
Subject: [PATCH 3/5] Switch to StringRef
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 8c514714bd7dd..fedc1e73b4123 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,11 +2392,11 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (MI.isInlineAsm()) {
- std::string Text = MI.getOperand(0).getSymbolName();
+ StringRef Text = MI.getOperand(0).getSymbolName();
if (Text.find("SGMASK:") != std::string::npos) {
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
Text = Text.substr(0, Text.find_first_of(" \t\r\n"));
- unsigned long InlineAsmMask = std::stoul(Text, nullptr, 0);
+ unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
}
}
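
[Editor's note] Patch 3 switches the parsing to StringRef but still converts back with Text.str() so std::stoul can do the conversion, and std::stoul throws on malformed input. A hedged alternative sketch, not part of the patch, using StringRef::getAsInteger (radix 0 auto-detects a 0x prefix) to stay exception-free:

    // Alternative sketch only: parse the SGMASK value without std::string.
    StringRef Text = MI.getOperand(0).getSymbolName();
    size_t Pos = Text.find("SGMASK:");
    if (Pos != StringRef::npos) {
      StringRef Val = Text.drop_front(Pos + strlen("SGMASK:"));
      Val = Val.take_until([](char C) { return isSpace(C); });
      unsigned long InlineAsmMask = 0;
      // getAsInteger returns false on success.
      if (!Val.getAsInteger(/*Radix=*/0, InlineAsmMask))
        Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
    }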
>From 6db71f799b29a386dacf1a097f33c69891d5d758 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Fri, 29 Aug 2025 17:35:22 -0500
Subject: [PATCH 4/5] Add testcase
---
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 71 ++++++++++++++++++++
1 file changed, 71 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
new file mode 100644
index 0000000000000..2f35640226fd6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -0,0 +1,71 @@
+; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s
+; CHECK: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMEND
+; CHECK: v_add_f32_e32
+; ModuleID = '<stdin>'
+source_filename = "llvm-link"
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+ at llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata"
+ at __hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0
+
+; Function Attrs: convergent mustprogress norecurse nounwind
+define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %0, 6
+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul3 = shl nsw i32 %add, 2
+ %idx.ext4 = sext i32 %mul3 to i64
+ %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
+ %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+ %3 = extractelement <4 x float> %2, i64 3
+ %4 = extractelement <4 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3
+ %6 = extractelement <4 x float> %2, i64 1
+ %7 = extractelement <4 x float> %2, i64 2
+ %add6 = fadd contract float %6, %7
+ %add7 = fadd contract float %5, %add6
+ store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Function Attrs: convergent nocallback nofree nounwind willreturn
+declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2
+
+attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nofree nounwind willreturn }
+attributes #3 = { convergent nounwind memory(none) }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"omnipotent char", !2, i64 0}
+!2 = !{!"Simple C++ TBAA"}
+!3 = !{i64 129}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"float", !1, i64 0}
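
[Editor's note] The sched.group.barrier calls at the end of %if.then pin the schedule into alternating groups, with the (1, 1, 0) slot intended for the SGMASK:0x1 inline asm. A hedged decoding, assuming the usual SchedGroupMask bit values (ALU = 0x1, VALU = 0x2, VMEM = 0x10) and expressed with the equivalent clang builtin for readability:

    // Hedged reading of the barrier sequence in the test (mask, size, syncID):
    __builtin_amdgcn_sched_group_barrier(16, 1, 0); // 1 VMEM instruction
    __builtin_amdgcn_sched_group_barrier( 2, 5, 0); // 5 VALU instructions
    __builtin_amdgcn_sched_group_barrier(16, 1, 0); // 1 VMEM instruction
    __builtin_amdgcn_sched_group_barrier( 2, 1, 0); // 1 VALU instruction
    __builtin_amdgcn_sched_group_barrier( 1, 1, 0); // 1 ALU slot for the SGMASK:0x1 asm
    __builtin_amdgcn_sched_group_barrier( 2, 1, 0); // 1 VALU instruction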
>From 43d23ab781d4fa7b4cb7614ccc096f3108b66739 Mon Sep 17 00:00:00 2001
From: Patrick Simmons <psimmons at pensando.io>
Date: Wed, 1 Oct 2025 15:25:02 -0500
Subject: [PATCH 5/5] Guess the constraints instead of using user-provided
hints
---
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 43 ++++++++++++++++++++
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll | 17 ++++----
2 files changed, 53 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fedc1e73b4123..5d352f16048f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2392,6 +2392,48 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (MI.isInlineAsm()) {
+ auto &TRI = TII->getRegisterInfo();
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool SGPR_used = false, VGPR_used = false, VMFMA_used = false,
+ MayLoad = MI.mayLoad(), MayStore = MI.mayStore();
+ for (const MachineOperand &Operand : MI.operands())
+ if (Operand.isReg()) {
+ auto &RegClass = *TRI.getRegClassForOperandReg(MRI, Operand);
+ if (TRI.isVGPRClass(&RegClass))
+ VGPR_used = true;
+ if (TRI.isAGPRClass(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+ VMFMA_used = true;
+ if (TRI.isSGPRClass(&RegClass))
+ SGPR_used = true;
+ }
+
+ unsigned long InlineAsmMask = 0;
+ if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VALU;
+ if (VMFMA_used)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::MFMA;
+ if (VGPR_used && MayLoad)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_READ;
+ if (VGPR_used && MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM_WRITE;
+ if (!VGPR_used && MayLoad)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS_READ;
+ if (!VGPR_used && MayStore)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS_WRITE;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::VALU ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::SALU)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::ALU;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::DS_READ ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::DS_WRITE)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::DS;
+ if (InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_READ ||
+ InlineAsmMask & (unsigned long)SchedGroupMask::VMEM_WRITE)
+ InlineAsmMask |= (unsigned long)SchedGroupMask::VMEM;
+
+ Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
+
+ // Original implementation
+#if 0
StringRef Text = MI.getOperand(0).getSymbolName();
if (Text.find("SGMASK:") != std::string::npos) {
Text = Text.substr(Text.find("SGMASK:") + strlen("SGMASK:"));
@@ -2399,6 +2441,7 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
unsigned long InlineAsmMask = std::stoul(Text.str(), nullptr, 0);
Result = ((unsigned long)SGMask & InlineAsmMask) != 0;
}
+#endif
}
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
index 2f35640226fd6..89bf6ad677545 100644
--- a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -28,12 +28,15 @@ if.then: ; preds = %entry
%mul3 = shl nsw i32 %add, 2
%idx.ext4 = sext i32 %mul3 to i64
%add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
- %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
- %3 = extractelement <4 x float> %2, i64 3
- %4 = extractelement <4 x float> %2, i64 0
- %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3
- %6 = extractelement <4 x float> %2, i64 1
- %7 = extractelement <4 x float> %2, i64 2
+ %2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0
+ %a20 = add i64 %idx.ext4, 2
+ %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16, !tbaa !0
+ %3 = extractelement <2 x float> %a22, i64 1
+ %4 = extractelement <2 x float> %2, i64 0
+ %5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %2, <2 x float> %a22) #3, !srcloc !3
+ %6 = extractelement <2 x float> %2, i64 1
+ %7 = extractelement <2 x float> %a22, i64 0
%add6 = fadd contract float %6, %7
%add7 = fadd contract float %5, %add6
store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4
@@ -41,7 +44,7 @@ if.then: ; preds = %entry
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
- tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
br label %if.end
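
[Editor's note] Patch 5 stops reading the SGMASK: annotation and instead classifies inline asm from its operands: VGPR operands alone count as VALU, an AGPR or >128-bit operand counts as MFMA, and mayLoad/mayStore select the VMEM or DS read/write groups. A minimal sketch of what that heuristic concludes for the updated test's asm, assuming the "=a" constraint places the result in an AGPR; names are hypothetical:

    // Hedged sketch: under patch 5's heuristic this asm is classified as MFMA
    // (the "=a" result operand is an AGPR), so sched_group_barrier(8, 1, 0)
    // can claim it; SGMASK: comments are no longer consulted.
    __device__ double mfma_fragment(double x, double y) {
      double acc;
      asm("v_mfma_f64_4x4x4f64 %0, %1, %2, 0" : "=a"(acc) : "v"(x), "v"(y));
      __builtin_amdgcn_sched_group_barrier(/*MFMA*/ 8, /*size*/ 1, /*syncID*/ 0);
      return acc;
    }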