[llvm] 0d0249e - Try To Guess SGMasks for Inline Asm Instructions (#155491)
via llvm-commits
llvm-commits@lists.llvm.org
Fri Jan 23 10:43:59 PST 2026
Author: Patrick Simmons
Date: 2026-01-23T12:43:53-06:00
New Revision: 0d0249e543c91bf2f55c4e123ae8c31d3fabb77a
URL: https://github.com/llvm/llvm-project/commit/0d0249e543c91bf2f55c4e123ae8c31d3fabb77a
DIFF: https://github.com/llvm/llvm-project/commit/0d0249e543c91bf2f55c4e123ae8c31d3fabb77a.diff
LOG: Try To Guess SGMasks for Inline Asm Instructions (#155491)
Addresses SWDEV-549227
Added:
llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Removed:
################################################################################
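The patch below teaches the IGroupLP mutation to guess a SchedGroupMask for
inline asm from its register operands and its mayLoad/mayStore flags, so that
sched_group_barrier pipelines can match inline asm instead of never admitting
it into a scheduling group. As a rough model of the heuristic, here is a
minimal standalone C++ sketch. The mask bit values mirror the SchedGroupMask
enum in AMDGPUIGroupLP.cpp; AsmFeatures and guessMask are hypothetical names
invented for illustration, standing in for the MachineInstr and SIRegisterInfo
queries the real code performs.

    // Hypothetical, simplified model of the heuristic in this patch.
    #include <cstdint>

    enum SGMask : uint32_t {
      ALU = 1u << 0,        VALU = 1u << 1,    SALU = 1u << 2,
      MFMA = 1u << 3,       VMEM = 1u << 4,    VMEM_READ = 1u << 5,
      VMEM_WRITE = 1u << 6, DS = 1u << 7,      DS_READ = 1u << 8,
      DS_WRITE = 1u << 9,
    };

    struct AsmFeatures {          // Derived from the asm's register operands.
      bool UsesSGPR = false;      // Any SGPR operand.
      bool UsesVGPR = false;      // Any VGPR operand.
      bool LooksLikeMFMA = false; // AGPR operand, or a register wider than 128 bits.
      bool UsesVReg32 = false;    // A 32-bit VGPR use.
      bool BigDef = false;        // Any def wider than 64 bits.
      bool MayLoad = false, MayStore = false; // MachineInstr::mayLoad()/mayStore().
    };

    uint32_t guessMask(const AsmFeatures &F) {
      uint32_t M = 0;
      if (F.UsesVGPR && !F.LooksLikeMFMA && !F.MayLoad && !F.MayStore)
        M |= VALU;
      if (F.UsesSGPR && !F.UsesVGPR && !F.MayLoad && !F.MayStore)
        M |= SALU;
      if (F.LooksLikeMFMA)
        M |= MFMA;
      if (F.UsesVGPR && F.MayLoad)               // A 32-bit VGPR address suggests LDS;
        M |= F.UsesVReg32 ? DS_READ : VMEM_READ; // wider registers suggest VMEM.
      if (F.UsesVGPR && F.MayStore)
        M |= F.UsesVReg32 ? DS_WRITE : VMEM_WRITE;
      if (F.BigDef)                              // Mirrors the patch's SGPR_big_def handling.
        M |= DS_READ;
      // Fold the specific bits into their umbrella categories.
      if (M & (VALU | SALU))
        M |= ALU;
      if (M & (DS_READ | DS_WRITE))
        M |= DS;
      if (M & (VMEM_READ | VMEM_WRITE))
        M |= VMEM;
      return M;
    }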
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 8340405c7c9ed..10ffbe281beac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -24,6 +24,8 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetOpcodes.h"
+#include <type_traits>
+
using namespace llvm;
#define DEBUG_TYPE "igrouplp"
@@ -2391,6 +2393,61 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
if (MI.isMetaInstruction())
Result = false;
+ else if (MI.isInlineAsm()) {
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ auto &MRI = MI.getParent()->getParent()->getRegInfo();
+ bool SGPR_used = false, SGPR_big_def = false, VGPR_used = false,
+ VMFMA_used = false, VReg32_used = false, MayLoad = MI.mayLoad(),
+ MayStore = MI.mayStore();
+ for (const MachineOperand &Operand : MI.operands())
+ if (Operand.isReg()) {
+ const TargetRegisterClass &RegClass =
+ *TRI.getRegClassForOperandReg(MRI, Operand);
+ if (TRI.hasVGPRs(&RegClass)) {
+ VGPR_used = true;
+ if (Operand.isUse() && TRI.getRegSizeInBits(RegClass) == 32)
+ VReg32_used = true;
+ }
+ // > 128-bit registers are usually only used by MFMA instructions, so
+ // we're using that as a heuristic to guess the schedule group mask of
+ // the inline asm.
+ if (TRI.hasAGPRs(&RegClass) || TRI.getRegSizeInBits(RegClass) > 128)
+ VMFMA_used = true;
+ if (TRI.hasSGPRs(&RegClass))
+ SGPR_used = true;
+ if (TRI.getRegSizeInBits(RegClass) > 64 && Operand.isDef())
+ SGPR_big_def = true;
+ }
+
+ typedef std::underlying_type_t<SchedGroupMask> SGMask_t;
+ SGMask_t InlineAsmMask = 0;
+ if (VGPR_used && !VMFMA_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::VALU;
+ if (SGPR_used && !VGPR_used && !MayLoad && !MayStore)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::SALU;
+ if (VMFMA_used)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::MFMA;
+ if (VGPR_used && MayLoad)
+ InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_READ
+ : SchedGroupMask::VMEM_READ);
+ if (VGPR_used && MayStore)
+ InlineAsmMask |= (SGMask_t)(VReg32_used ? SchedGroupMask::DS_WRITE
+ : SchedGroupMask::VMEM_WRITE);
+ if (SGPR_big_def)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::DS_READ;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::VALU ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::SALU)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::ALU;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::DS_READ ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::DS_WRITE)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::DS;
+ if (InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_READ ||
+ InlineAsmMask & (SGMask_t)SchedGroupMask::VMEM_WRITE)
+ InlineAsmMask |= (SGMask_t)SchedGroupMask::VMEM;
+
+ Result = ((SGMask_t)SGMask & InlineAsmMask) != 0;
+ }
+
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
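Combined with the final test above, Result = ((SGMask_t)SGMask & InlineAsmMask) != 0,
the inline asm may join any scheduling group whose mask overlaps the guessed
mask. A small usage sketch, continuing the hypothetical guessMask() model from
above and matching the v_add_f32 asm in the test_valu kernel below:

    #include <cassert>

    int main() {
      AsmFeatures F{};           // e.g. asm "v_add_f32_e32 $0, $1, $2", "=v,v,v"
      F.UsesVGPR = true;         // All operands are VGPRs...
      F.UsesVReg32 = true;       // ...and the uses are 32 bits wide.
      uint32_t M = guessMask(F);
      assert(M == (VALU | ALU)); // Guessed as a VALU (hence also ALU) instruction.
      assert((M & SALU) == 0);   // A SALU-only sched group still rejects it.
      return 0;
    }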
diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
new file mode 100644
index 0000000000000..da692f1a557e7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/inlineasm-sgmask.ll
@@ -0,0 +1,179 @@
+; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
+target triple = "amdgcn-amd-amdhsa"
+
+define protected amdgpu_kernel void @test_valu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_valu
+; CHECK: s_mul_i32
+; CHECK: ASMSTART
+entry:
+ %a0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %a0, 6
+ %a1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %a1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul4 = shl nsw i32 %add, 2
+ %idx.ext5 = sext i32 %mul4 to i64
+ %add.ptr6 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext5
+ %a2 = load <4 x float>, ptr addrspace(1) %add.ptr6, align 16
+ %a3 = extractelement <4 x float> %a2, i64 3
+ %a4 = extractelement <4 x float> %a2, i64 0
+ %a5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2", "=v,v,v"(float %a3, float %a4) #3
+ %a6 = extractelement <4 x float> %a2, i64 1
+ %a7 = extractelement <4 x float> %a2, i64 2
+ %add7 = fadd contract float %a6, %a7
+ %add8 = fadd contract float %a5, %add7
+ store float %add8, ptr addrspace(1) %add.ptr, align 4
+ %mul9 = mul nsw i32 %k, 3
+ store i32 %mul9, ptr addrspace(1) %ret.coerce, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 7, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 4, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define protected amdgpu_kernel void @test_salu(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %k, ptr addrspace(1) noundef writeonly captures(none) %ret.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_salu
+; CHECK: %bb.1
+; CHECK-NEXT: s_load
+; CHECK-NEXT: s_load
+; CHECK-NEXT: s_waitcnt
+; CHECK-NEXT: ASMSTART
+entry:
+ %a0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %a0, 6
+ %a1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %a1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul4 = shl nsw i32 %add, 2
+ %idx.ext5 = sext i32 %mul4 to i64
+ %add.ptr6 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext5
+ %a2 = load <4 x float>, ptr addrspace(1) %add.ptr6, align 16
+ %a3 = extractelement <4 x float> %a2, i64 3
+ %a4 = extractelement <4 x float> %a2, i64 0
+ %a5 = fadd contract float %a3, %a4
+ %a6 = extractelement <4 x float> %a2, i64 1
+ %a7 = extractelement <4 x float> %a2, i64 2
+ %add7 = fadd contract float %a6, %a7
+ %add8 = fadd contract float %a5, %add7
+ store float %add8, ptr addrspace(1) %add.ptr, align 4
+ %mul9 = tail call noundef i32 asm "s_mul_i32 $0, $1, 3", "=s,s"(i32 %k) #3
+ store i32 %mul9, ptr addrspace(1) %ret.coerce, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 4, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 10, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+define protected amdgpu_kernel void @test_mfma(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mfma
+; CHECK: v_add_f32_e32
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_mfma_f64
+; CHECK-NEXT: ;;#ASMEND
+; CHECK: v_add_f32_e32
+entry:
+ %a0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %a0, 6
+ %a1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %a1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul3 = shl nsw i32 %add, 2
+ %idx.ext4 = sext i32 %mul3 to i64
+ %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
+ %a2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
+ %a20 = add i64 %idx.ext4, 2
+ %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
+ %a3 = extractelement <2 x float> %a22, i64 1
+ %a4 = extractelement <2 x float> %a2, i64 0
+ %a5 = tail call contract noundef float asm "v_mfma_f64_4x4x4f64 $0, $1, $2, 0", "=a,v,v"(<2 x float> %a2, <2 x float> %a22) #3
+ %a6 = extractelement <2 x float> %a2, i64 1
+ %a7 = extractelement <2 x float> %a22, i64 0
+ %add6 = fadd contract float %a6, %a7
+ %add7 = fadd contract float %a5, %add6
+ store float %add7, ptr addrspace(1) %add.ptr, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 9, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+@_ZZ16group4_sum_floatPfPKfE6cpymem = internal addrspace(3) global [8 x float] undef, align 16
+
+define protected amdgpu_kernel void @test_ds(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 {
+; CHECK-LABEL: test_ds
+; CHECK-DAG: v_lshl_add_u64
+; CHECK-DAG: v_add_f32_e32
+; CHECK-NEXT: ASMSTART
+entry:
+ %a0 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+ %mul = shl i32 %a0, 6
+ %a1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %add = add i32 %mul, %a1
+ %cmp = icmp slt i32 %add, %length
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %idx.ext = sext i32 %add to i64
+ %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext
+ %mul3 = shl nsw i32 %add, 2
+ %idx.ext4 = sext i32 %mul3 to i64
+ %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4
+ %a2 = load <2 x float>, ptr addrspace(1) %add.ptr5, align 16
+ %a20 = add i64 %idx.ext4, 2
+ %a21 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %a20
+ %a22 = load <2 x float>, ptr addrspace(1) %a21, align 16
+ %a3 = extractelement <2 x float> %a22, i64 1
+ %a4 = extractelement <2 x float> %a2, i64 0
+ %a5 = tail call contract noundef float asm "ds_read_b32 $0, $1 offset:0", "=v,v,~{memory}"(i32 ptrtoint (ptr addrspacecast (ptr addrspace(3) @_ZZ16group4_sum_floatPfPKfE6cpymem to ptr) to i32)) #4
+ %a6 = extractelement <2 x float> %a2, i64 1
+ %a7 = extractelement <2 x float> %a22, i64 0
+ %add6 = fadd contract float %a6, %a7
+ %add7 = fadd contract float %a5, %add6
+ store float %add7, ptr addrspace(1) %add.ptr, align 4
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 11, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 128, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0)
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1
+
+declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
+
+declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2
+
+attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="1" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #2 = { convergent nocallback nofree nounwind willreturn }
+attributes #3 = { convergent nounwind memory(none) }
+attributes #4 = { convergent nounwind }
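A note for reading the tests above: llvm.amdgcn.sched.group.barrier(mask, size,
syncID) requests a scheduling group of 'size' instructions matching 'mask', and
groups sharing a syncID are pipelined in the order the barriers appear, so
test_valu pins 7 VALU instructions, then 1 SALU, then 4 VALU around its inline
asm. The mask argument uses the same bit encoding as SchedGroupMask, which the
following self-check (hypothetical, using the sketch enum above) pins down:

    static_assert(VALU == 2 && SALU == 4 && MFMA == 8 && VMEM == 16 && DS == 128,
                  "mask bits used by the sched.group.barrier calls in this test");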