[llvm] [AMDGPU] Add inreg support for SGPR arguments (PR #67182)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 22 12:19:18 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
<details>
<summary>Changes</summary>
Function parameters marked with inreg are supposed to be allocated to SGPRs. However, for compute functions, this is ignored and function parameters are allocated to VGPRs. This fix modifies CC_AMDGPU_Func in AMDGPUCallingConv.td to use SGPRs if input arg is marked inreg. Correspondingly, the function check_saveexec_overwrites_vcmp_source in testcase vcmp-saveexec-to-vcmpx.ll is duplicate such that one has inreg and the other does not.
---
Full diff: https://github.com/llvm/llvm-project/pull/67182.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td (+5)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+3)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+2-2)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll (+21)
- (modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll (-2)
- (added) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+359)
- (modified) llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll (+35-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4d7090942142f3c..4b43fbecaf523ad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -183,6 +183,11 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
+ >>>,
+
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 591775dbf45e396..5dc9c6427dd54b2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2565,6 +2565,9 @@ SDValue SITargetLowering::LowerFormalArguments(
analyzeFormalArgumentsCompute(CCInfo, Ins);
} else {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+ if (!IsGraphics && !IsKernel && !Subtarget->enableFlatScratch()) {
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, AMDGPU::SGPR2, AMDGPU::SGPR3}, 4);
+ }
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index f624cc53a952082..62c16e68fcf78d6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2539,7 +2539,7 @@ bool isArgPassedInSGPR(const Argument *A) {
A->hasAttribute(Attribute::ByVal);
default:
// TODO: Should calls support inreg for SGPR inputs?
- return false;
+ return A->hasAttribute(Attribute::InReg);
}
}
@@ -2566,7 +2566,7 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
CB->paramHasAttr(ArgNo, Attribute::ByVal);
default:
// TODO: Should calls support inreg for SGPR inputs?
- return false;
+ return CB->paramHasAttr(ArgNo, Attribute::InReg);
}
}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
index 48528c6112b00ef..2e5a3e998ed89d4 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -39,6 +39,15 @@ define i32 @asm_sgpr(i32 %divergent) {
ret i32 %sgpr
}
+; SGPR asm outputs are uniform regardless of the input operands.
+; Argument not divergent if marked inreg.
+; CHECK-LABEL: for function 'asm_sgpr_inreg_arg':
+; CHECK-NOT: DIVERGENT
+define i32 @asm_sgpr_inreg_arg(i32 inreg %divergent) {
+ %sgpr = call i32 asm "; def $0, $1","=s,v"(i32 %divergent)
+ ret i32 %sgpr
+}
+
; CHECK-LABEL: for function 'asm_mixed_sgpr_vgpr':
; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1, $2", "=s,=v,v"(i32 %divergent)
; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
@@ -58,6 +67,18 @@ define void @single_lane_func_arguments(i32 %i32, i1 %i1) #2 {
ret void
}
+; CHECK-LABEL: for function 'divergent_args':
+; CHECK: DIVERGENT ARGUMENTS
+define void @divergent_args(i32 %i32, i1 %i1) {
+ ret void
+}
+
+; CHECK-LABEL: for function 'no_divergent_args_if_inreg':
+; CHECK-NOT: DIVERGENT ARGUMENTS
+define void @no_divergent_args_if_inreg(i32 inreg %i32, i1 inreg %i1) {
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #1
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
index 4b014f969257bdb..f6fe654896a2de9 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
@@ -30,8 +30,6 @@ define amdgpu_kernel void @test_amdgpu_kernel(ptr addrspace(4) byref([4 x <16 x
; CHECK: DIVERGENT:
; CHECK: DIVERGENT:
; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
define void @test_c(ptr addrspace(5) byval([4 x <16 x i8>]) %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
new file mode 100644
index 000000000000000..7750fb3316c1c82
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -0,0 +1,359 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
+define void @void_func_i1(i1 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i1_inreg(i1 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1_inreg:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_and_b32 s4, s4, 1
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_and_b32 s0, s0, 1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i1 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i8(i8 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i8:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i8 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i8_inreg(i8 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i8_inreg:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i8 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i16(i16 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i16:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i16 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i16_inreg(i16 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i16_inreg:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i16 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i32(i32 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i32:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i32 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_i32_inreg(i32 inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_i32_inreg:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store i32 %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_f16(half %arg0) #0 {
+; GFX89-LABEL: void_func_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store half %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_f16_inreg(half inreg %arg0) #0 {
+; GFX89-LABEL: void_func_f16_inreg:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store half %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_f32(float %arg0) #0 {
+; CIGFX89-LABEL: void_func_f32:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store float %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_f32_inreg(float inreg %arg0) #0 {
+; CIGFX89-LABEL: void_func_f32_inreg:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: v_mov_b32_e32 v0, s4
+; CIGFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store float %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v2i16(<2 x i16> %arg0) #0 {
+; GFX89-LABEL: void_func_v2i16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x i16> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v2i16_inreg(<2 x i16> inreg %arg0) #0 {
+; GFX89-LABEL: void_func_v2i16_inreg:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x i16> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v2f16(<2 x half> %arg0) #0 {
+; GFX89-LABEL: void_func_v2f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x half> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
+define void @void_func_v2f16_inreg(<2 x half> inreg %arg0) #0 {
+; GFX89-LABEL: void_func_v2f16_inreg:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_waitcnt vmcnt(0)
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ store <2 x half> %arg0, ptr addrspace(1) undef
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
index 9dcd3a66a16dbf0..14d13bbee28e0ff 100644
--- a/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
+++ b/llvm/test/CodeGen/AMDGPU/vcmp-saveexec-to-vcmpx.ll
@@ -133,15 +133,48 @@ l2:
; Omit the transformation if the s_and_saveexec instruction overwrites
; any of the v_cmp source operands.
+
; GCN-LABEL: check_saveexec_overwrites_vcmp_source:
-; GCN: .LBB7_3: ; %then
+; GCN: .LBB7_2: ; %then
+; GFX1010: v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1010-NEXT: s_cmp_ge_i32 s[[C]], s[[B]]
+; GFX1030: v_cmp_eq_u32_e64 s[[C:[0-9]+]], s[[A:[0-9]+]], s[[B:[0-9]+]]
+; GFX1030-NEXT: s_cmp_ge_i32 s[[C]], s[[B]]
+define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+entry:
+ %0 = icmp sge i32 %a, 0
+ br i1 %0, label %if, label %then
+
+if:
+ %1 = shl i32 %a, 2
+ %2 = or i32 %1, %b
+ ret i32 %2
+
+then:
+ %3 = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+ %4 = trunc i64 %3 to i32
+ %5 = icmp slt i32 %4, %b
+ br i1 %5, label %after, label %end
+
+after:
+ ret i32 %4
+
+end:
+ ret i32 %a
+}
+
+; Omit the transformation if the s_and_saveexec instruction overwrites
+; any of the v_cmp source operands.
+
+; GCN-LABEL: check_saveexec_overwrites_vcmp_source_no_inreg:
+; GCN: .LBB8_3: ; %then
; GFX1010: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
; GFX1010-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
; GFX1010-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
; GFX1030: v_cmp_ge_i32_e32 vcc_lo, s[[A:[0-9]+]], v{{.*}}
; GFX1030-NEXT: v_mov_b32_e32 {{.*}}, s[[A]]
; GFX1030-NEXT: s_and_saveexec_b32 s[[A]], vcc_lo
-define i32 @check_saveexec_overwrites_vcmp_source(i32 inreg %a, i32 inreg %b) {
+define i32 @check_saveexec_overwrites_vcmp_source_no_inreg(i32 %a, i32 %b) {
entry:
%0 = icmp sge i32 %a, 0
br i1 %0, label %if, label %then
``````````
</details>
https://github.com/llvm/llvm-project/pull/67182
More information about the llvm-commits
mailing list