[llvm] [AMDGPU] Add s_delay_alu in relocated PC-relative symbol sequence (PR #71061)

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 2 06:47:43 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Jay Foad (jayfoad)

<details>
<summary>Changes</summary>

Add an s_delay_alu in this sequence to account for the single cycle
delay before s0 can be used:

  s_getpc_b64 s[0:1]
  s_delay_alu instid0(SALU_CYCLE_1)
  s_add_u32 s0, s0, symbol@lo+8
  s_addc_u32 s1, s1, symbol@hi+16

AMDGPUInsertDelayAlu will not insert this s_delay_alu instruction
automatically because it cannot modify the contents of bundles.


---

Patch is 109.21 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71061.diff


18 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp (+2) 
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+11-2) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll (+9-6) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+12-8) 
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+165-110) 
- (modified) llvm/test/CodeGen/AMDGPU/calling-conventions.ll (+6-4) 
- (modified) llvm/test/CodeGen/AMDGPU/cc-update.ll (+12-8) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+60-40) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+36-24) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+36-24) 
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+60-40) 
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+9-6) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.buffer.load.ll (+3-2) 
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+24-16) 
- (modified) llvm/test/CodeGen/AMDGPU/propagate-attributes-bitcast-function.ll (+8-4) 
- (modified) llvm/test/CodeGen/AMDGPU/rel32.ll (+8-5) 
- (modified) llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll (+6-4) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll (+6-4) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index 7619a39bac9c142..b417d8502603f30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -342,6 +342,8 @@ class AMDGPUInsertDelayAlu : public MachineFunctionPass {
 
     // Iterate over the contents of bundles, but don't emit any instructions
     // inside a bundle.
+    // TODO: If the bundle already contains an s_delay_alu instruction, remember
+    // it in LastDelayAlu so we can try encoding another delay in it?
     for (auto &MI : MBB.instrs()) {
       if (MI.isBundle() || MI.isMetaInstruction())
         continue;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f61735a59c97078..2743198766d578d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2405,13 +2405,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
     // instruction.
 
+    int64_t Adjust = 0;
+    if (ST.hasDelayAlu()) {
+      // Manually add the 1 cycle delay before using RegLo. AMDGPUInsertDelayAlu
+      // will not add this automatically inside a bundle:
+      //   s_delay_alu instid0(SALU_CYCLE_1)
+      Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_DELAY_ALU)).addImm(9));
+      Adjust += 4;
+    }
+
     if (OpLo.isGlobal())
-      OpLo.setOffset(OpLo.getOffset() + 4);
+      OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
     Bundler.append(
         BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
 
     if (OpHi.isGlobal())
-      OpHi.setOffset(OpHi.getOffset() + 12);
+      OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                        .addReg(RegHi)
                        .add(OpHi));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
index 96db1f889690df6..c31f4e48c672b30 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -117,8 +117,9 @@ define void @func_dynamic_stackalloc_sgpr_align4() {
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
@@ -252,8 +253,9 @@ define void @func_dynamic_stackalloc_sgpr_align16() {
 ; GFX11-NEXT:    s_mov_b32 s33, s32
 ; GFX11-NEXT:    s_add_i32 s32, s32, 16
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
@@ -395,8 +397,9 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_add_i32 s32, s32, 64
 ; GFX11-NEXT:    s_and_not1_b32 s33, s33, 31
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
-; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s0, s0, gv at gotpcrel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s1, s1, gv at gotpcrel32@hi+16
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s33, s2
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
index 7beaf3103586375..b64da0ff7c4b7e9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll
@@ -36,8 +36,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; GISEL-GFX11:       ; %bb.0:
 ; GISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+16
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v4, v8 :: v_dual_mov_b32 v5, v9
 ; GISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v6, v10 :: v_dual_mov_b32 v7, v11
@@ -72,8 +73,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_simple_call(<4 x i32> inreg %sgpr,
 ; DAGISEL-GFX11:       ; %bb.0:
 ; DAGISEL-GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[4:5]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT:    s_add_u32 s4, s4, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s5, s5, use at gotpcrel32@hi+16
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v7, v11 :: v_dual_mov_b32 v6, v10
 ; DAGISEL-GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v5, v9 :: v_dual_mov_b32 v4, v8
@@ -141,8 +143,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v28, s24
 ; GISEL-GFX11-NEXT:    scratch_store_b32 off, v29, s25
 ; GISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; GISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; GISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; GISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; GISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+16
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9
 ; GISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
 ; GISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11
@@ -272,8 +275,9 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_spill(<24 x i32> inreg %sgprs, <24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v20, s24
 ; DAGISEL-GFX11-NEXT:    scratch_store_b32 off, v19, s25
 ; DAGISEL-GFX11-NEXT:    s_getpc_b64 s[24:25]
-; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+4
-; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+12
+; DAGISEL-GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; DAGISEL-GFX11-NEXT:    s_add_u32 s24, s24, use at gotpcrel32@lo+8
+; DAGISEL-GFX11-NEXT:    s_addc_u32 s25, s25, use at gotpcrel32@hi+16
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v32, v15 :: v_dual_mov_b32 v33, v14
 ; DAGISEL-GFX11-NEXT:    s_load_b64 s[24:25], s[24:25], 0x0
 ; DAGISEL-GFX11-NEXT:    v_dual_mov_b32 v34, v13 :: v_dual_mov_b32 v35, v12
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 4e2b83af7f5e20b..f2aa24c1a7f7175 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -124,8 +124,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -228,8 +229,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext at rel32@hi+16
 ; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -338,8 +340,9 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext at rel32@hi+16
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -432,8 +435,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -534,8 +538,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -639,8 +644,9 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -732,8 +738,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -833,8 +840,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -938,8 +946,9 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1031,8 +1040,9 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1123,8 +1133,9 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1229,8 +1240,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1333,8 +1345,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1448,8 +1461,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1576,8 +1590,9 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1675,8 +1690,9 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1764,8 +1780,9 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1856,8 +1873,9 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32 at rel32@lo+8
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32 at rel32@hi+16
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
 ; GFX11-NEXT:    s_endpgm
@@ -1953,8 +1971,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32 at rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32 at rel32@lo+8
+; GFX11-N...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/71061


More information about the llvm-commits mailing list