[llvm] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol sequence (PR #71061)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 06:47:43 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-globalisel
Author: Jay Foad (jayfoad)
<details>
<summary>Changes</summary>
Add an s_delay_alu in this sequence to account for the single cycle
delay before s0 can be used:
s_getpc_b64 s[0:1]
s_delay_alu instid0(SALU_CYCLE_1)
s_add_u32 s0, s0, symbol@lo+8
s_addc_u32 s1, s1, symbol@hi+16
AMDGPUInsertDelayAlu will not insert this s_delay_alu instruction
automatically because it cannot modify the contents of bundles.
---
Patch is 109.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71061.diff
18 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (+2)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+11-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+12-8)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+165-110)
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+6-4)
- (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+12-8)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+60-40)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+36-24)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+36-24)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+60-40)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+9-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+3-2)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+24-16)
- (modified) llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/rel32.ll (+8-5)
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+6-4)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll (+6-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 7619a39bac9c142..b417d8502603f30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -342,6 +342,8 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
// Iterate over the contents of bundles, but don't emit any instructions
// inside a bundle.
+ // TODO: If the bundle already contains an s_delay_alu instruction, remember
+ // it in LastDelayAlu so we can try encoding another delay in it?
for (auto &MI : MBB.instrs()) {
if (MI.isBundle() || MI.isMetaInstruction())
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f61735a59c97078..2743198766d578d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2405,13 +2405,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// the encoding of $symbol starts 12 bytes after the start of the s_add_u32
// instruction.
+ int64_t Adjust = 0;
+ if (ST.hasDelayAlu()) {
+ // Manually add the 1 cycle delay before using RegLo. AMDGPUInsertDelayAlu
+ // will not add this automatically inside a bundle:
+ // s_delay_alu instid0(SALU_CYCLE_1)
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_DELAY_ALU)).addImm(9));
+ Adjust += 4;
+ }
+
if (OpLo.isGlobal())
- OpLo.setOffset(OpLo.getOffset() + 4);
+ OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
Bundler.append(
BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
if (OpHi.isGlobal())
- OpHi.setOffset(OpHi.getOffset() + 12);
+ OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi)
.add(OpHi));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 96db1f889690df6..c31f4e48c672b30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -117,8 +117,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
@@ -252,8 +253,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
; GFX11-NEXT: s_mov_b32 s33, s32
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
@@ -395,8 +397,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
; GFX11-NEXT: s_add_i32 s32, s32, 64
; GFX11-NEXT: s_and_not1_b32 s33, s33, 31
; GFX11-NEXT: s_getpc_b64 s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT: s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_mov_b32 s33, s2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..b64da0ff7c4b7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -36,8 +36,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX11-NEXT: s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+16
; GISEL-GFX11-NEXT: v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
; GISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
@@ -72,8 +73,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
; DAGISEL-GFX11: ; %bb.0:
; DAGISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX11-NEXT: s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT: s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT: s_addc_u32 s5, s5, use at gotpcrel32@hi+16
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
; DAGISEL-GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
@@ -141,8 +143,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v28, s24
; GISEL-GFX11-NEXT: scratch_store_b32 off, v29, s25
; GISEL-GFX11-NEXT: s_getpc_b64 s[24:25]
-; GISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+16
; GISEL-GFX11-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
; GISEL-GFX11-NEXT: s_load_b64 s[24:25], s[24:25], 0x0
; GISEL-GFX11-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
@@ -272,8 +275,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v20, s24
; DAGISEL-GFX11-NEXT: scratch_store_b32 off, v19, s25
; DAGISEL-GFX11-NEXT: s_getpc_b64 s[24:25]
-; DAGISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT: s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT: s_addc_u32 s25, s25, use at gotpcrel32@hi+16
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
; DAGISEL-GFX11-NEXT: s_load_b64 s[24:25], s[24:25], 0x0
; DAGISEL-GFX11-NEXT: v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..f2aa24c1a7f7175 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -124,8 +124,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -228,8 +229,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+16
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -338,8 +340,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+16
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -432,8 +435,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -534,8 +538,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -639,8 +644,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -732,8 +738,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -833,8 +840,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -938,8 +946,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1031,8 +1040,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1123,8 +1133,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1229,8 +1240,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1333,8 +1345,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1448,8 +1461,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1576,8 +1590,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1675,8 +1690,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1764,8 +1780,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1856,8 +1873,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+8
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_endpgm
@@ -1953,8 +1971,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1]
; GFX11-NEXT: s_mov_b32 s32, 0
; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+8
+; GFX11-N...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/71061
More information about the llvm-commits
mailing list